From a5dc818e19f314f6e0990fd89913c0f8911cce66 Mon Sep 17 00:00:00 2001 From: jeremy Date: Wed, 12 Oct 2022 16:35:47 +0200 Subject: [PATCH 01/38] Add new dataset for mgbench --- tests/mgbench/dataset_creator.py | 94 +++++++++++ tests/mgbench/datasets.py | 272 +++++++++++++++++++++---------- 2 files changed, 277 insertions(+), 89 deletions(-) create mode 100644 tests/mgbench/dataset_creator.py diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py new file mode 100644 index 000000000..432a44ec7 --- /dev/null +++ b/tests/mgbench/dataset_creator.py @@ -0,0 +1,94 @@ +# Copyright 2021 Memgraph Ltd. +# +# Use of this software is governed by the Business Source License +# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source +# License, and you may not use this file except in compliance with the Business Source License. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0, included in the file +# licenses/APL.txt. + +import random + +import helpers + +# Explaination of datasets: +# - empty_only_index: contains index; contains no data +# - small: contains index; contains data (small dataset) +# +# Datamodel is as follow: +# +# ┌──────────────┐ +# │ Permission │ +# ┌────────────────┐ │ Schema:uuid │ ┌────────────┐ +# │:IS_FOR_IDENTITY├────┤ Index:name ├───┤:IS_FOR_USER│ +# └┬───────────────┘ └──────────────┘ └────────────┤ +# │ │ +# ┌──────▼──────────────┐ ┌──▼───────────┐ +# │ Identity │ │ User │ +# │ Schema:uuid │ │ Schema:uuid │ +# │ Index:platformId │ │ Index:email │ +# │ Index:name │ └──────────────┘ +# └─────────────────────┘ +# +# +# - User: attributes: ["uuid", "name", "platformId"] +# - Permission: attributes: ["uuid", "name"] +# - Identity: attributes: ["uuid", "email"] +# +# Indexes: +# - User: [User(uuid), User(platformId), User(name)] +# - Permission: [Permission(uuid), Permission(name)] +# - Identity: [Identity(uuid), Identity(email)] +# +# Edges: +# - (:Permission)-[:IS_FOR_USER]->(:User) +# - (:Permission)-[:IS_FOR_IDENTITYR]->(:Identity) +# +# Distributed specific: uuid is the schema + +filename = "dataset.cypher" +f = open(filename, "x") + +f.write("MATCH (n) DETACH DELETE n;\n") + +# Create the indexes +f.write("CREATE INDEX ON :User;\n") +f.write("CREATE INDEX ON :Permission;\n") +f.write("CREATE INDEX ON :Identity;\n") +f.write("CREATE INDEX ON :User(platformId);\n") +f.write("CREATE INDEX ON :User(name);\n") +f.write("CREATE INDEX ON :Permission(name);\n") +f.write("CREATE INDEX ON :Identity(email);\n") + +# Create extra index: in distributed, this will be the schema +f.write("CREATE INDEX ON :User(uuid);\n") +f.write("CREATE INDEX ON :Permission(uuid);\n") +f.write("CREATE INDEX ON :Identity(uuid);\n") + +platform_ids = [f"somePlatformId_{id}" for id in range(10)] + +# This is the number of clusters to change if you want a bigger dataset +number_of_clusters = 3000000 + +for index in range(1, number_of_clusters + 1): + platform_id = platform_ids[random.randint(0, len(platform_ids) - 1)] + user_uuid = index + platform_uuid = number_of_clusters + index + identity_uuid = 2 * number_of_clusters + index + + # Create the nodes + f.write(f'CREATE (:User {{uuid: {user_uuid}, platformId: "{platform_id}", name: "name_user_{user_uuid}"}});\n') + f.write(f'CREATE (:Permission {{uuid: {platform_uuid}, name: "name_permission_{platform_uuid}"}});\n') + f.write(f'CREATE (:Permission 
{{uuid: {identity_uuid}, name: "mail_{identity_uuid}@something.com"}});\n') + + # Create the edges + f.write( + f"MATCH (permission:Permission {{uuid: {platform_uuid}}}), (user:User {{uuid: {user_uuid}}}) CREATE (permission)-[e: IS_FOR_USER]->(user);\n" + ) + f.write( + f"MATCH (permission:Permission {{uuid: {platform_uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e: IS_FOR_IDENTITY]->(identity);\n" + ) + +f.close() diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index dbaaa2de9..45fdf67db 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -45,13 +45,10 @@ class Dataset: variant = self.DEFAULT_VARIANT if variant not in self.VARIANTS: raise ValueError("Invalid test variant!") - if (self.FILES and variant not in self.FILES) and \ - (self.URLS and variant not in self.URLS): - raise ValueError("The variant doesn't have a defined URL or " - "file path!") + if (self.FILES and variant not in self.FILES) and (self.URLS and variant not in self.URLS): + raise ValueError("The variant doesn't have a defined URL or " "file path!") if variant not in self.SIZES: - raise ValueError("The variant doesn't have a defined dataset " - "size!") + raise ValueError("The variant doesn't have a defined dataset " "size!") self._variant = variant if self.FILES is not None: self._file = self.FILES.get(variant, None) @@ -63,8 +60,7 @@ class Dataset: self._url = None self._size = self.SIZES[variant] if "vertices" not in self._size or "edges" not in self._size: - raise ValueError("The size defined for this variant doesn't " - "have the number of vertices and/or edges!") + raise ValueError("The size defined for this variant doesn't " "have the number of vertices and/or edges!") self._num_vertices = self._size["vertices"] self._num_edges = self._size["edges"] @@ -76,8 +72,7 @@ class Dataset: cached_input, exists = directory.get_file("dataset.cypher") if not exists: print("Downloading dataset file:", self._url) - downloaded_file = helpers.download_file( - self._url, directory.get_path()) + downloaded_file = helpers.download_file(self._url, directory.get_path()) print("Unpacking and caching file:", downloaded_file) helpers.unpack_and_move_file(downloaded_file, cached_input) print("Using cached dataset file:", cached_input) @@ -137,18 +132,17 @@ class Pokec(Dataset): # Arango benchmarks def benchmark__arango__single_vertex_read(self): - return ("MATCH (n:User {id : $id}) RETURN n", - {"id": self._get_random_vertex()}) + return ("MATCH (n:User {id : $id}) RETURN n", {"id": self._get_random_vertex()}) def benchmark__arango__single_vertex_write(self): - return ("CREATE (n:UserTemp {id : $id}) RETURN n", - {"id": random.randint(1, self._num_vertices * 10)}) + return ("CREATE (n:UserTemp {id : $id}) RETURN n", {"id": random.randint(1, self._num_vertices * 10)}) def benchmark__arango__single_edge_write(self): vertex_from, vertex_to = self._get_random_from_to() - return ("MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " - "CREATE (n)-[e:Temp]->(m) RETURN e", - {"from": vertex_from, "to": vertex_to}) + return ( + "MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " "CREATE (n)-[e:Temp]->(m) RETURN e", + {"from": vertex_from, "to": vertex_to}, + ) def benchmark__arango__aggregate(self): return ("MATCH (n:User) RETURN n.age, COUNT(*)", {}) @@ -157,92 +151,94 @@ class Pokec(Dataset): return ("MATCH (n:User) WHERE n.age >= 18 RETURN n.age, COUNT(*)", {}) def benchmark__arango__expansion_1(self): - return ("MATCH (s:User {id: $id})-->(n:User) " - 
"RETURN n.id", - {"id": self._get_random_vertex()}) + return ("MATCH (s:User {id: $id})-->(n:User) " "RETURN n.id", {"id": self._get_random_vertex()}) def benchmark__arango__expansion_1_with_filter(self): - return ("MATCH (s:User {id: $id})-->(n:User) " - "WHERE n.age >= 18 " - "RETURN n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->(n:User) " "WHERE n.age >= 18 " "RETURN n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_2(self): - return ("MATCH (s:User {id: $id})-->()-->(n:User) " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ("MATCH (s:User {id: $id})-->()-->(n:User) " "RETURN DISTINCT n.id", {"id": self._get_random_vertex()}) def benchmark__arango__expansion_2_with_filter(self): - return ("MATCH (s:User {id: $id})-->()-->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->(n:User) " "WHERE n.age >= 18 " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_3(self): - return ("MATCH (s:User {id: $id})-->()-->()-->(n:User) " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->()-->(n:User) " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_3_with_filter(self): - return ("MATCH (s:User {id: $id})-->()-->()-->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->()-->(n:User) " "WHERE n.age >= 18 " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_4(self): - return ("MATCH (s:User {id: $id})-->()-->()-->()-->(n:User) " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->()-->()-->(n:User) " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_4_with_filter(self): - return ("MATCH (s:User {id: $id})-->()-->()-->()-->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->()-->()-->(n:User) " "WHERE n.age >= 18 " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__neighbours_2(self): - return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " "RETURN DISTINCT n.id", {"id": self._get_random_vertex()}) def benchmark__arango__neighbours_2_with_filter(self): - return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-[*1..2]->(n:User) " "WHERE n.age >= 18 " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__neighbours_2_with_data(self): - return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " - "RETURN DISTINCT n.id, n", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-[*1..2]->(n:User) " "RETURN DISTINCT n.id, n", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__neighbours_2_with_data_and_filter(self): - return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id, n", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-[*1..2]->(n:User) " 
"WHERE n.age >= 18 " "RETURN DISTINCT n.id, n", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__shortest_path(self): vertex_from, vertex_to = self._get_random_from_to() - return ("MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " - "MATCH p=(n)-[*bfs..15]->(m) " - "RETURN extract(n in nodes(p) | n.id) AS path", - {"from": vertex_from, "to": vertex_to}) + return ( + "MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " + "MATCH p=(n)-[*bfs..15]->(m) " + "RETURN extract(n in nodes(p) | n.id) AS path", + {"from": vertex_from, "to": vertex_to}, + ) def benchmark__arango__shortest_path_with_filter(self): vertex_from, vertex_to = self._get_random_from_to() - return ("MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " - "MATCH p=(n)-[*bfs..15 (e, n | n.age >= 18)]->(m) " - "RETURN extract(n in nodes(p) | n.id) AS path", - {"from": vertex_from, "to": vertex_to}) + return ( + "MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " + "MATCH p=(n)-[*bfs..15 (e, n | n.age >= 18)]->(m) " + "RETURN extract(n in nodes(p) | n.id) AS path", + {"from": vertex_from, "to": vertex_to}, + ) # Our benchmark queries def benchmark__create__edge(self): vertex_from, vertex_to = self._get_random_from_to() - return ("MATCH (a:User {id: $from}), (b:User {id: $to}) " - "CREATE (a)-[:TempEdge]->(b)", - {"from": vertex_from, "to": vertex_to}) + return ( + "MATCH (a:User {id: $from}), (b:User {id: $to}) " "CREATE (a)-[:TempEdge]->(b)", + {"from": vertex_from, "to": vertex_to}, + ) def benchmark__create__pattern(self): return ("CREATE ()-[:TempEdge]->()", {}) @@ -251,9 +247,12 @@ class Pokec(Dataset): return ("CREATE ()", {}) def benchmark__create__vertex_big(self): - return ("CREATE (:L1:L2:L3:L4:L5:L6:L7 {p1: true, p2: 42, " - "p3: \"Here is some text that is not extremely short\", " - "p4:\"Short text\", p5: 234.434, p6: 11.11, p7: false})", {}) + return ( + "CREATE (:L1:L2:L3:L4:L5:L6:L7 {p1: true, p2: 42, " + 'p3: "Here is some text that is not extremely short", ' + 'p4:"Short text", p5: 234.434, p6: 11.11, p7: false})', + {}, + ) def benchmark__aggregation__count(self): return ("MATCH (n) RETURN count(n), count(n.age)", {}) @@ -262,29 +261,124 @@ class Pokec(Dataset): return ("MATCH (n) RETURN min(n.age), max(n.age), avg(n.age)", {}) def benchmark__match__pattern_cycle(self): - return ("MATCH (n:User {id: $id})-[e1]->(m)-[e2]->(n) " - "RETURN e1, m, e2", - {"id": self._get_random_vertex()}) + return ("MATCH (n:User {id: $id})-[e1]->(m)-[e2]->(n) " "RETURN e1, m, e2", {"id": self._get_random_vertex()}) def benchmark__match__pattern_long(self): - return ("MATCH (n1:User {id: $id})-[e1]->(n2)-[e2]->" - "(n3)-[e3]->(n4)<-[e4]-(n5) " - "RETURN n5 LIMIT 1", - {"id": self._get_random_vertex()}) + return ( + "MATCH (n1:User {id: $id})-[e1]->(n2)-[e2]->" "(n3)-[e3]->(n4)<-[e4]-(n5) " "RETURN n5 LIMIT 1", + {"id": self._get_random_vertex()}, + ) def benchmark__match__pattern_short(self): - return ("MATCH (n:User {id: $id})-[e]->(m) " - "RETURN m LIMIT 1", - {"id": self._get_random_vertex()}) + return ("MATCH (n:User {id: $id})-[e]->(m) " "RETURN m LIMIT 1", {"id": self._get_random_vertex()}) def benchmark__match__vertex_on_label_property(self): - return ("MATCH (n:User) WITH n WHERE n.id = $id RETURN n", - {"id": self._get_random_vertex()}) + return ("MATCH (n:User) WITH n WHERE n.id = $id RETURN n", {"id": self._get_random_vertex()}) def benchmark__match__vertex_on_label_property_index(self): - return ("MATCH (n:User {id: $id}) RETURN n", - {"id": self._get_random_vertex()}) + return 
("MATCH (n:User {id: $id}) RETURN n", {"id": self._get_random_vertex()}) def benchmark__match__vertex_on_property(self): - return ("MATCH (n {id: $id}) RETURN n", - {"id": self._get_random_vertex()}) + return ("MATCH (n {id: $id}) RETURN n", {"id": self._get_random_vertex()}) + + +class Distributed(Dataset): + + # Explaination of datasets: + # - empty_only_index: contains index; contains no data + # - small/medium/large: contains index; contains data (respectively small/medium/large dataset) + # + # See dataset_creator.py to understand the datamodel and generate a dataset + + NAME = "distributed" + VARIANTS = ["empty_only_index", "small", "medium", "large"] + DEFAULT_VARIANT = "empty_only_index" + URLS = { + "empty_only_index": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_empty_only_index.setup.cypher.gz", + "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_small.setup.cypher.gz", + "medium": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_medium.setup.cypher.gz", + "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_large.setup.cypher.gz", + } + SIZES = { + "empty_only_index": { + "vertices": 0, + "edges": -1, # not used + "uuid_ranges": { + "User": {"first_uuid": 0, "last_uuid": 0}, + "Permission": {"first_uuid": 0, "last_uuid": 0}, + "Identity": {"first_uuid": 0, "last_uuid": 0}, + }, + }, + "small": { + "vertices": 30, + "edges": -1, # not used + "uuid_ranges": { + "User": {"first_uuid": 1, "last_uuid": 10}, + "Permission": {"first_uuid": 11, "last_uuid": 20}, + "Identity": {"first_uuid": 21, "last_uuid": 30}, + }, + }, + "medium": { + "vertices": 30000, + "edges": -1, # not used + "uuid_ranges": { + "User": {"first_uuid": 1, "last_uuid": 10000}, + "Permission": {"first_uuid": 10001, "last_uuid": 20000}, + "Identity": {"first_uuid": 10001, "last_uuid": 30000}, + }, + }, + "large": { + "vertices": 3000000, + "edges": -1, # not used + "uuid_ranges": { + "User": {"first_uuid": 1, "last_uuid": 1000000}, + "Permission": {"first_uuid": 100001, "last_uuid": 2000000}, + "Identity": {"first_uuid": 1000001, "last_uuid": 3000000}, + }, + }, + } + + def _get_random_uuid(self, type): + assert type in ["User", "Permission", "Identity"] + + first_uuid = Dataset.get_size(self)["uuid_ranges"][type]["first_uuid"] + last_uuid = Dataset.get_size(self)["uuid_ranges"][type]["last_uuid"] + + random_value = random.randint(first_uuid, last_uuid) + return random_value + + def __init__(self, variant=None): + Dataset.__init__(self, variant) + self.next_value_idx = Dataset.get_size(self)["vertices"] + 1 + + def benchmark__create__vertex(self): + self.next_value_idx += 1 + query = (f"CREATE (:User {{uuid: {self.next_value_idx}}});", {}) + return query + + def benchmark__create__edges(self): + permission_uuid = self._get_random_uuid("Permission") + user_uuid = self._get_random_uuid("User") + + query = ( + "MATCH (permission:Permission {uuid: $permission_uuid}), (user:User {uuid: $user_uuid}) " + "CREATE (permission)-[:IS_FOR_USER]->(user)", + {"permission_uuid": permission_uuid, "user_uuid": user_uuid}, + ) + + return query + + def benchmark__match__match_all_vertices(self): + self.next_value_idx += 1 + query = ("MATCH (n) RETURN *", {}) + return query + + def benchmark__match__match_on_labelled_vertices(self): + self.next_value_idx += 1 + query = ("MATCH (n:User) RETURN *", {}) + return query + + def benchmark__match__match_all_verteices_with_edges(self): + self.next_value_idx += 1 + query = ("MATCH 
(permission:Permission)-[e:IS_FOR_USER]->(user:User) RETURN *", {}) + return query From 58243f4a268e0a0745060f2247da07e466b93069 Mon Sep 17 00:00:00 2001 From: jeremy Date: Tue, 18 Oct 2022 15:47:13 +0200 Subject: [PATCH 02/38] Rename User->File Use parser for argument i.o. simple variable in script --- tests/mgbench/dataset_creator.py | 118 ++++++++++++++++++------------- tests/mgbench/datasets.py | 36 +++++----- 2 files changed, 88 insertions(+), 66 deletions(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 432a44ec7..754419ccc 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -10,8 +10,8 @@ # licenses/APL.txt. import random - import helpers +import argparse # Explaination of datasets: # - empty_only_index: contains index; contains no data @@ -22,73 +22,95 @@ import helpers # ┌──────────────┐ # │ Permission │ # ┌────────────────┐ │ Schema:uuid │ ┌────────────┐ -# │:IS_FOR_IDENTITY├────┤ Index:name ├───┤:IS_FOR_USER│ +# │:IS_FOR_IDENTITY├────┤ Index:name ├───┤:IS_FOR_FILE│ # └┬───────────────┘ └──────────────┘ └────────────┤ # │ │ -# ┌──────▼──────────────┐ ┌──▼───────────┐ -# │ Identity │ │ User │ -# │ Schema:uuid │ │ Schema:uuid │ -# │ Index:platformId │ │ Index:email │ -# │ Index:name │ └──────────────┘ -# └─────────────────────┘ +# ┌──────▼──────────────┐ ┌──▼────────────────┐ +# │ Identity │ │ File │ +# │ Schema:uuid │ │ Schema:uuid │ +# │ Index:email │ │ Index:name │ +# └─────────────────────┘ │ Index:platformId │ +# └───────────────────┘ # -# -# - User: attributes: ["uuid", "name", "platformId"] +# - File: attributes: ["uuid", "name", "platformId"] # - Permission: attributes: ["uuid", "name"] # - Identity: attributes: ["uuid", "email"] # # Indexes: -# - User: [User(uuid), User(platformId), User(name)] +# - File: [File(uuid), File(platformId), File(name)] # - Permission: [Permission(uuid), Permission(name)] # - Identity: [Identity(uuid), Identity(email)] # # Edges: -# - (:Permission)-[:IS_FOR_USER]->(:User) +# - (:Permission)-[:IS_FOR_FILE]->(:File) # - (:Permission)-[:IS_FOR_IDENTITYR]->(:Identity) # -# Distributed specific: uuid is the schema +# AccessControl specific: uuid is the schema -filename = "dataset.cypher" -f = open(filename, "x") +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--number_of_identities", type=int, default=10) + parser.add_argument("--number_of_files", type=int, default=10) + parser.add_argument("--percentage_of_permissions", type=float, default=1.0) + parser.add_argument("--filename", default="dataset.cypher") -f.write("MATCH (n) DETACH DELETE n;\n") + args = parser.parse_args() -# Create the indexes -f.write("CREATE INDEX ON :User;\n") -f.write("CREATE INDEX ON :Permission;\n") -f.write("CREATE INDEX ON :Identity;\n") -f.write("CREATE INDEX ON :User(platformId);\n") -f.write("CREATE INDEX ON :User(name);\n") -f.write("CREATE INDEX ON :Permission(name);\n") -f.write("CREATE INDEX ON :Identity(email);\n") + number_of_identities = args.number_of_identities + number_of_files = args.number_of_files + percentage_of_permissions = args.percentage_of_permissions + filename = args.filename -# Create extra index: in distributed, this will be the schema -f.write("CREATE INDEX ON :User(uuid);\n") -f.write("CREATE INDEX ON :Permission(uuid);\n") -f.write("CREATE INDEX ON :Identity(uuid);\n") + assert number_of_identities > 0 + assert number_of_files > 0 + assert percentage_of_permissions > 0.0 and percentage_of_permissions <= 1.0 + assert filename != 
"" -platform_ids = [f"somePlatformId_{id}" for id in range(10)] + f = open(filename, "w") -# This is the number of clusters to change if you want a bigger dataset -number_of_clusters = 3000000 + f.write("MATCH (n) DETACH DELETE n;\n") -for index in range(1, number_of_clusters + 1): - platform_id = platform_ids[random.randint(0, len(platform_ids) - 1)] - user_uuid = index - platform_uuid = number_of_clusters + index - identity_uuid = 2 * number_of_clusters + index + # Create the indexes + f.write("CREATE INDEX ON :File;\n") + f.write("CREATE INDEX ON :Permission;\n") + f.write("CREATE INDEX ON :Identity;\n") + f.write("CREATE INDEX ON :File(platformId);\n") + f.write("CREATE INDEX ON :File(name);\n") + f.write("CREATE INDEX ON :Permission(name);\n") + f.write("CREATE INDEX ON :Identity(email);\n") - # Create the nodes - f.write(f'CREATE (:User {{uuid: {user_uuid}, platformId: "{platform_id}", name: "name_user_{user_uuid}"}});\n') - f.write(f'CREATE (:Permission {{uuid: {platform_uuid}, name: "name_permission_{platform_uuid}"}});\n') - f.write(f'CREATE (:Permission {{uuid: {identity_uuid}, name: "mail_{identity_uuid}@something.com"}});\n') + # Create extra index: in distributed, this will be the schema + f.write("CREATE INDEX ON :File(uuid);\n") + f.write("CREATE INDEX ON :Permission(uuid);\n") + f.write("CREATE INDEX ON :Identity(uuid);\n") - # Create the edges - f.write( - f"MATCH (permission:Permission {{uuid: {platform_uuid}}}), (user:User {{uuid: {user_uuid}}}) CREATE (permission)-[e: IS_FOR_USER]->(user);\n" - ) - f.write( - f"MATCH (permission:Permission {{uuid: {platform_uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e: IS_FOR_IDENTITY]->(identity);\n" - ) + uuid = 1 -f.close() + files = [] + # Create the nodes File + for index in range(0, number_of_files): + f.write(f'CREATE (:File {{uuid: {uuid}, platformId: platform_id, name: "name_file_{uuid}"}});\n') + uuid += 1 + + identities = [] + # Create the nodes Identity + for index in range(0, number_of_identities): + f.write(f'CREATE (:Identity {{uuid: {uuid}, name: "mail_{uuid}@something.com"}});\n') + uuid += 1 + + for outer_index in range(0, number_of_files): + for inner_index in range(0, number_of_identities): + file_uuid = outer_index + identity_uuid = number_of_files + inner_index + + if random.random() <= percentage_of_permissions: + f.write(f'CREATE (:Permission {{uuid: {uuid}, name: "name_permission_{uuid}"}});\n') + f.write( + f"MATCH (permission:Permission {{uuid: {uuid}}}), (file:File {{uuid: {file_uuid}}}) CREATE (permission)-[e: IS_FOR_FILE]->(file);\n" + ) + f.write( + f"MATCH (permission:Permission {{uuid: {uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e: IS_FOR_IDENTITY]->(identity);\n" + ) + uuid += 1 + + f.close() diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 45fdf67db..4c5f19043 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -282,7 +282,7 @@ class Pokec(Dataset): return ("MATCH (n {id: $id}) RETURN n", {"id": self._get_random_vertex()}) -class Distributed(Dataset): +class AccessControl(Dataset): # Explaination of datasets: # - empty_only_index: contains index; contains no data @@ -290,21 +290,21 @@ class Distributed(Dataset): # # See dataset_creator.py to understand the datamodel and generate a dataset - NAME = "distributed" + NAME = "accesscontrol" VARIANTS = ["empty_only_index", "small", "medium", "large"] DEFAULT_VARIANT = "empty_only_index" URLS = { - "empty_only_index": 
"https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_empty_only_index.setup.cypher.gz", - "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_small.setup.cypher.gz", - "medium": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_medium.setup.cypher.gz", - "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_large.setup.cypher.gz", + "empty_only_index": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_empty_only_index.setup.cypher.gz", + "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_small.setup.cypher.gz", + "medium": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_medium.setup.cypher.gz", + "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_large.setup.cypher.gz", } SIZES = { "empty_only_index": { "vertices": 0, "edges": -1, # not used "uuid_ranges": { - "User": {"first_uuid": 0, "last_uuid": 0}, + "File": {"first_uuid": 0, "last_uuid": 0}, "Permission": {"first_uuid": 0, "last_uuid": 0}, "Identity": {"first_uuid": 0, "last_uuid": 0}, }, @@ -313,7 +313,7 @@ class Distributed(Dataset): "vertices": 30, "edges": -1, # not used "uuid_ranges": { - "User": {"first_uuid": 1, "last_uuid": 10}, + "File": {"first_uuid": 1, "last_uuid": 10}, "Permission": {"first_uuid": 11, "last_uuid": 20}, "Identity": {"first_uuid": 21, "last_uuid": 30}, }, @@ -322,7 +322,7 @@ class Distributed(Dataset): "vertices": 30000, "edges": -1, # not used "uuid_ranges": { - "User": {"first_uuid": 1, "last_uuid": 10000}, + "File": {"first_uuid": 1, "last_uuid": 10000}, "Permission": {"first_uuid": 10001, "last_uuid": 20000}, "Identity": {"first_uuid": 10001, "last_uuid": 30000}, }, @@ -331,7 +331,7 @@ class Distributed(Dataset): "vertices": 3000000, "edges": -1, # not used "uuid_ranges": { - "User": {"first_uuid": 1, "last_uuid": 1000000}, + "File": {"first_uuid": 1, "last_uuid": 1000000}, "Permission": {"first_uuid": 100001, "last_uuid": 2000000}, "Identity": {"first_uuid": 1000001, "last_uuid": 3000000}, }, @@ -339,7 +339,7 @@ class Distributed(Dataset): } def _get_random_uuid(self, type): - assert type in ["User", "Permission", "Identity"] + assert type in ["File", "Permission", "Identity"] first_uuid = Dataset.get_size(self)["uuid_ranges"][type]["first_uuid"] last_uuid = Dataset.get_size(self)["uuid_ranges"][type]["last_uuid"] @@ -353,17 +353,17 @@ class Distributed(Dataset): def benchmark__create__vertex(self): self.next_value_idx += 1 - query = (f"CREATE (:User {{uuid: {self.next_value_idx}}});", {}) + query = (f"CREATE (:File {{uuid: {self.next_value_idx}}});", {}) return query def benchmark__create__edges(self): permission_uuid = self._get_random_uuid("Permission") - user_uuid = self._get_random_uuid("User") + file_uuid = self._get_random_uuid("File") query = ( - "MATCH (permission:Permission {uuid: $permission_uuid}), (user:User {uuid: $user_uuid}) " - "CREATE (permission)-[:IS_FOR_USER]->(user)", - {"permission_uuid": permission_uuid, "user_uuid": user_uuid}, + "MATCH (permission:Permission {uuid: $permission_uuid}), (file:File {uuid: $file_uuid}) " + "CREATE (permission)-[:IS_FOR_FILE]->(file)", + {"permission_uuid": permission_uuid, "file_uuid": file_uuid}, ) return query @@ -375,10 +375,10 @@ class Distributed(Dataset): def benchmark__match__match_on_labelled_vertices(self): self.next_value_idx += 1 - query = ("MATCH (n:User) RETURN *", {}) + query = ("MATCH (n:File) RETURN *", {}) return query def benchmark__match__match_all_verteices_with_edges(self): 
self.next_value_idx += 1 - query = ("MATCH (permission:Permission)-[e:IS_FOR_USER]->(user:User) RETURN *", {}) + query = ("MATCH (permission:Permission)-[e:IS_FOR_FILE]->(file:File) RETURN *", {}) return query From ddb30f49ea843f703ff33231e83edae9bbe5d7b2 Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Mon, 31 Oct 2022 09:43:22 +0100 Subject: [PATCH 03/38] Update datasets.py --- tests/mgbench/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 4c5f19043..c2e8499bd 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -378,7 +378,7 @@ class AccessControl(Dataset): query = ("MATCH (n:File) RETURN *", {}) return query - def benchmark__match__match_all_verteices_with_edges(self): + def benchmark__match__match_all_vertices_with_edges(self): self.next_value_idx += 1 query = ("MATCH (permission:Permission)-[e:IS_FOR_FILE]->(file:File) RETURN *", {}) return query From 03c095e780df92f41da1c298a05030a245610d6c Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 10:52:06 +0100 Subject: [PATCH 04/38] Update assert --- tests/mgbench/dataset_creator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 754419ccc..1b48838b6 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -61,8 +61,8 @@ if __name__ == "__main__": percentage_of_permissions = args.percentage_of_permissions filename = args.filename - assert number_of_identities > 0 - assert number_of_files > 0 + assert number_of_identities >= 0 + assert number_of_files >= 0 assert percentage_of_permissions > 0.0 and percentage_of_permissions <= 1.0 assert filename != "" From 5ef08f841ae8a0d6a299020e37f4da2220057c4a Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 11:56:20 +0100 Subject: [PATCH 05/38] Update Dataset creation script --- tests/mgbench/dataset_creator.py | 4 ++-- tests/mgbench/datasets.py | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 1b48838b6..c63ae9731 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -100,8 +100,8 @@ if __name__ == "__main__": for outer_index in range(0, number_of_files): for inner_index in range(0, number_of_identities): - file_uuid = outer_index - identity_uuid = number_of_files + inner_index + file_uuid = outer_index + 1 + identity_uuid = number_of_files + inner_index + 1 if random.random() <= percentage_of_permissions: f.write(f'CREATE (:Permission {{uuid: {uuid}, name: "name_permission_{uuid}"}});\n') diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index c2e8499bd..953111807 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -314,26 +314,26 @@ class AccessControl(Dataset): "edges": -1, # not used "uuid_ranges": { "File": {"first_uuid": 1, "last_uuid": 10}, - "Permission": {"first_uuid": 11, "last_uuid": 20}, - "Identity": {"first_uuid": 21, "last_uuid": 30}, + "Identity": {"first_uuid": 11, "last_uuid": 20}, + "Permission": {"first_uuid": 21, "last_uuid": 120}, # 120=10*10+20 }, }, "medium": { + "vertices": 3000, + "edges": -1, # not used + "uuid_ranges": { + "File": {"first_uuid": 1, "last_uuid": 1000}, + "Identity": {"first_uuid": 1001, "last_uuid": 2000}, + "Permission": {"first_uuid": 2001, "last_uuid": 1002000}, # 
1002000=1000*1000+2000 + }, + }, + "large": { "vertices": 30000, "edges": -1, # not used "uuid_ranges": { "File": {"first_uuid": 1, "last_uuid": 10000}, - "Permission": {"first_uuid": 10001, "last_uuid": 20000}, - "Identity": {"first_uuid": 10001, "last_uuid": 30000}, - }, - }, - "large": { - "vertices": 3000000, - "edges": -1, # not used - "uuid_ranges": { - "File": {"first_uuid": 1, "last_uuid": 1000000}, - "Permission": {"first_uuid": 100001, "last_uuid": 2000000}, - "Identity": {"first_uuid": 1000001, "last_uuid": 3000000}, + "Identity": {"first_uuid": 10001, "last_uuid": 20000}, + "Permission": {"first_uuid": 20001, "last_uuid": 100020000}, # 100020000=10000*10000+20000 }, }, } From c5ee6ffbc2cfed49dfc2e6dba106b0fa6736747f Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 12:41:28 +0100 Subject: [PATCH 06/38] Update dataset_creator script --- tests/mgbench/dataset_creator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index c63ae9731..dc98350cb 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -89,7 +89,7 @@ if __name__ == "__main__": files = [] # Create the nodes File for index in range(0, number_of_files): - f.write(f'CREATE (:File {{uuid: {uuid}, platformId: platform_id, name: "name_file_{uuid}"}});\n') + f.write(f'CREATE (:File {{uuid: {uuid}, platformId: "platform_id", name: "name_file_{uuid}"}});\n') uuid += 1 identities = [] From d62a45752a24e22063d9ae0d5eebe906b93a4608 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 12:55:38 +0100 Subject: [PATCH 07/38] Remove unused variable --- tests/mgbench/dataset_creator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index dc98350cb..712fae1bc 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -86,7 +86,6 @@ if __name__ == "__main__": uuid = 1 - files = [] # Create the nodes File for index in range(0, number_of_files): f.write(f'CREATE (:File {{uuid: {uuid}, platformId: "platform_id", name: "name_file_{uuid}"}});\n') From 2898120eeb7a5a2398216c95f9dea2908ecf2328 Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Mon, 31 Oct 2022 12:55:42 +0100 Subject: [PATCH 08/38] Update tests/mgbench/datasets.py Co-authored-by: Jure Bajic --- tests/mgbench/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 953111807..6e9a68186 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -348,7 +348,7 @@ class AccessControl(Dataset): return random_value def __init__(self, variant=None): - Dataset.__init__(self, variant) + super().__init__(self, variant) self.next_value_idx = Dataset.get_size(self)["vertices"] + 1 def benchmark__create__vertex(self): From bae8c084b119d87939f900273cddc853fa96a217 Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Mon, 31 Oct 2022 12:56:02 +0100 Subject: [PATCH 09/38] Update tests/mgbench/datasets.py Co-authored-by: Jure Bajic --- tests/mgbench/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 6e9a68186..60660c297 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -341,8 +341,8 @@ class AccessControl(Dataset): def _get_random_uuid(self, type): assert type in ["File", 
"Permission", "Identity"] - first_uuid = Dataset.get_size(self)["uuid_ranges"][type]["first_uuid"] - last_uuid = Dataset.get_size(self)["uuid_ranges"][type]["last_uuid"] + first_uuid = self.get_size()["uuid_ranges"][type]["first_uuid"] + last_uuid = self.get_size()["uuid_ranges"][type]["last_uuid"] random_value = random.randint(first_uuid, last_uuid) return random_value From f28ba89584b412e00c06e1401df5ab3677521813 Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Mon, 31 Oct 2022 13:01:42 +0100 Subject: [PATCH 10/38] Update tests/mgbench/dataset_creator.py Co-authored-by: Jure Bajic --- tests/mgbench/dataset_creator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 712fae1bc..78cf9ca64 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -1,4 +1,4 @@ -# Copyright 2021 Memgraph Ltd. +# Copyright 2022 Memgraph Ltd. # # Use of this software is governed by the Business Source License # included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source From f04e1cda4bb70ef7ea19856467a5797bb3f509ab Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 13:02:05 +0100 Subject: [PATCH 11/38] Add function --- tests/mgbench/dataset_creator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 712fae1bc..7ba2827f5 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -47,7 +47,8 @@ import argparse # # AccessControl specific: uuid is the schema -if __name__ == "__main__": + +def main(): parser = argparse.ArgumentParser() parser.add_argument("--number_of_identities", type=int, default=10) parser.add_argument("--number_of_files", type=int, default=10) @@ -113,3 +114,7 @@ if __name__ == "__main__": uuid += 1 f.close() + + +if __name__ == "__main__": + main() From acbf3c764c8e9d61375930da2bf91e7bddf4df2b Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 13:35:41 +0100 Subject: [PATCH 12/38] Remove arg from __init__ --- tests/mgbench/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 60660c297..6e9412af4 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -348,7 +348,7 @@ class AccessControl(Dataset): return random_value def __init__(self, variant=None): - super().__init__(self, variant) + super().__init__(variant) self.next_value_idx = Dataset.get_size(self)["vertices"] + 1 def benchmark__create__vertex(self): From c90b38faf0b27f897df5b1805d9a640fc7905c5d Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 14:49:32 +0100 Subject: [PATCH 13/38] Update aws address for datasets --- tests/mgbench/datasets.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 6e9412af4..3f6ad9eb0 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -294,10 +294,10 @@ class AccessControl(Dataset): VARIANTS = ["empty_only_index", "small", "medium", "large"] DEFAULT_VARIANT = "empty_only_index" URLS = { - "empty_only_index": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_empty_only_index.setup.cypher.gz", - "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_small.setup.cypher.gz", - "medium": 
"https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_medium.setup.cypher.gz", - "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_large.setup.cypher.gz", + "empty_only_index": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_empty_only_index.setup.cypher.gz", + "small": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_small.setup.cypher.gz", + "medium": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_medium.setup.cypher.gz", + "large": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_large.setup.cypher.gz", } SIZES = { "empty_only_index": { From e1f18f37337606ef763cecc5c05513d5659e1896 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 15:19:34 +0100 Subject: [PATCH 14/38] Update location of Pokec datasets on aws --- tests/mgbench/datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 3f6ad9eb0..40a76b96f 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -104,9 +104,9 @@ class Pokec(Dataset): DEFAULT_VARIANT = "small" FILES = None URLS = { - "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/pokec_small.setup.cypher", - "medium": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/pokec_medium.setup.cypher", - "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/pokec_large.setup.cypher.gz", + "small": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/pokec_small.setup.cypher", + "medium": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/pokec_medium.setup.cypher", + "large": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/pokec/pokec_large.setup.cypher.gz", } SIZES = { "small": {"vertices": 10000, "edges": 121716}, From 4c5cd1f847c41d554ed9b213a1bb2aefb616bdc3 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 31 Oct 2022 16:10:04 +0100 Subject: [PATCH 15/38] Add possibility to have MgBench working against local file --- tests/mgbench/helpers.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/mgbench/helpers.py b/tests/mgbench/helpers.py index 7488b1443..1a4cd3c3e 100644 --- a/tests/mgbench/helpers.py +++ b/tests/mgbench/helpers.py @@ -28,18 +28,25 @@ def get_binary_path(path, base=""): def download_file(url, path): - ret = subprocess.run(["wget", "-nv", "--content-disposition", url], - stderr=subprocess.PIPE, cwd=path, check=True) - data = ret.stderr.decode("utf-8") - tmp = data.split("->")[1] - name = tmp[tmp.index('"') + 1:tmp.rindex('"')] - return os.path.join(path, name) + if "https://" in url: + ret = subprocess.run( + ["wget", "-nv", "--content-disposition", url], stderr=subprocess.PIPE, cwd=path, check=True + ) + data = ret.stderr.decode("utf-8") + tmp = data.split("->")[1] + name = tmp[tmp.index('"') + 1 : tmp.rindex('"')] + return os.path.join(path, name) + else: + assert os.path.exists(url) + subprocess.run(["cp", url, path], stderr=subprocess.PIPE, cwd=path, check=True) + tmp = url.split("/") + name = tmp[len(tmp) - 1] + return os.path.join(path, name) def unpack_and_move_file(input_path, output_path): if input_path.endswith(".gz"): - subprocess.run(["gunzip", input_path], - stdout=subprocess.DEVNULL, check=True) + subprocess.run(["gunzip", input_path], stdout=subprocess.DEVNULL, check=True) input_path = input_path[:-3] os.rename(input_path, 
output_path) From 787987168cc6b2a956a227884ed53a869e459c6c Mon Sep 17 00:00:00 2001 From: jeremy Date: Tue, 1 Nov 2022 12:51:01 +0100 Subject: [PATCH 16/38] Make benchmark work with any customer datasets --- tests/mgbench/benchmark.py | 221 ++++++++++++++++++++----------------- 1 file changed, 118 insertions(+), 103 deletions(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 5ce715571..498f04f44 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -25,6 +25,7 @@ import datasets import log import helpers import runners +import importlib def get_queries(gen, count): @@ -37,8 +38,7 @@ def get_queries(gen, count): return ret -def match_patterns(dataset, variant, group, test, is_default_variant, - patterns): +def match_patterns(dataset, variant, group, test, is_default_variant, patterns): for pattern in patterns: verdict = [fnmatch.fnmatchcase(dataset, pattern[0])] if pattern[1] != "": @@ -58,7 +58,7 @@ def filter_benchmarks(generators, patterns): pattern = patterns[i].split("/") if len(pattern) > 4 or len(pattern) == 0: raise Exception("Invalid benchmark description '" + pattern + "'!") - pattern.extend(["", "*", "*"][len(pattern) - 1:]) + pattern.extend(["", "*", "*"][len(pattern) - 1 :]) patterns[i] = pattern filtered = [] for dataset in sorted(generators.keys()): @@ -68,8 +68,7 @@ def filter_benchmarks(generators, patterns): current = collections.defaultdict(list) for group in tests: for test_name, test_func in tests[group]: - if match_patterns(dataset, variant, group, test_name, - is_default_variant, patterns): + if match_patterns(dataset, variant, group, test_name, is_default_variant, patterns): current[group].append((test_name, test_func)) if len(current) > 0: filtered.append((generator(variant), dict(current))) @@ -78,54 +77,61 @@ def filter_benchmarks(generators, patterns): # Parse options. 
parser = argparse.ArgumentParser( - description="Memgraph benchmark executor.", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("benchmarks", nargs="*", default="", - help="descriptions of benchmarks that should be run; " - "multiple descriptions can be specified to run multiple " - "benchmarks; the description is specified as " - "dataset/variant/group/test; Unix shell-style wildcards " - "can be used in the descriptions; variant, group and test " - "are optional and they can be left out; the default " - "variant is '' which selects the default dataset variant; " - "the default group is '*' which selects all groups; the " - "default test is '*' which selects all tests") -parser.add_argument("--memgraph-binary", - default=helpers.get_binary_path("memgraph"), - help="Memgraph binary used for benchmarking") -parser.add_argument("--client-binary", - default=helpers.get_binary_path("tests/mgbench/client"), - help="client binary used for benchmarking") -parser.add_argument("--num-workers-for-import", type=int, - default=multiprocessing.cpu_count() // 2, - help="number of workers used to import the dataset") -parser.add_argument("--num-workers-for-benchmark", type=int, - default=1, - help="number of workers used to execute the benchmark") -parser.add_argument("--single-threaded-runtime-sec", type=int, - default=10, - help="single threaded duration of each test") -parser.add_argument("--no-load-query-counts", action="store_true", - help="disable loading of cached query counts") -parser.add_argument("--no-save-query-counts", action="store_true", - help="disable storing of cached query counts") -parser.add_argument("--export-results", default="", - help="file path into which results should be exported") -parser.add_argument("--temporary-directory", default="/tmp", - help="directory path where temporary data should " - "be stored") -parser.add_argument("--no-properties-on-edges", action="store_true", - help="disable properties on edges") + description="Memgraph benchmark executor.", formatter_class=argparse.ArgumentDefaultsHelpFormatter +) +parser.add_argument( + "benchmarks", + nargs="*", + default="", + help="descriptions of benchmarks that should be run; " + "multiple descriptions can be specified to run multiple " + "benchmarks; the description is specified as " + "dataset/variant/group/test; Unix shell-style wildcards " + "can be used in the descriptions; variant, group and test " + "are optional and they can be left out; the default " + "variant is '' which selects the default dataset variant; " + "the default group is '*' which selects all groups; the " + "default test is '*' which selects all tests", +) +parser.add_argument( + "--memgraph-binary", default=helpers.get_binary_path("memgraph"), help="Memgraph binary used for benchmarking" +) +parser.add_argument( + "--client-binary", + default=helpers.get_binary_path("tests/mgbench/client"), + help="client binary used for benchmarking", +) +parser.add_argument( + "--num-workers-for-import", + type=int, + default=multiprocessing.cpu_count() // 2, + help="number of workers used to import the dataset", +) +parser.add_argument( + "--num-workers-for-benchmark", type=int, default=1, help="number of workers used to execute the benchmark" +) +parser.add_argument("--single-threaded-runtime-sec", type=int, default=10, help="single threaded duration of each test") +parser.add_argument("--no-load-query-counts", action="store_true", help="disable loading of cached query counts") +parser.add_argument("--no-save-query-counts", 
action="store_true", help="disable storing of cached query counts") +parser.add_argument("--export-results", default="", help="file path into which results should be exported") +parser.add_argument( + "--temporary-directory", default="/tmp", help="directory path where temporary data should " "be stored" +) +parser.add_argument("--no-properties-on-edges", action="store_true", help="disable properties on edges") +parser.add_argument("--datasets", default="datasets", help="datasets to scan") +parser.add_argument("--datasets-path", default=".", help="path to datasets to scan") args = parser.parse_args() +sys.path.append(args.datasets_path) +dataset_to_use = importlib.import_module(args.datasets) + # Detect available datasets. generators = {} -for key in dir(datasets): +for key in dir(dataset_to_use): if key.startswith("_"): continue - dataset = getattr(datasets, key) - if not inspect.isclass(dataset) or dataset == datasets.Dataset or \ - not issubclass(dataset, datasets.Dataset): + dataset = getattr(dataset_to_use, key) + if not inspect.isclass(dataset) or dataset == datasets.Dataset or not issubclass(dataset, datasets.Dataset): continue tests = collections.defaultdict(list) for funcname in dir(dataset): @@ -135,8 +141,9 @@ for key in dir(datasets): tests[group].append((test, funcname)) generators[dataset.NAME] = (dataset, dict(tests)) if dataset.PROPERTIES_ON_EDGES and args.no_properties_on_edges: - raise Exception("The \"{}\" dataset requires properties on edges, " - "but you have disabled them!".format(dataset.NAME)) + raise Exception( + 'The "{}" dataset requires properties on edges, ' "but you have disabled them!".format(dataset.NAME) + ) # List datasets if there is no specified dataset. if len(args.benchmarks) == 0: @@ -144,8 +151,7 @@ if len(args.benchmarks) == 0: for name in sorted(generators.keys()): print("Dataset:", name) dataset, tests = generators[name] - print(" Variants:", ", ".join(dataset.VARIANTS), - "(default: " + dataset.DEFAULT_VARIANT + ")") + print(" Variants:", ", ".join(dataset.VARIANTS), "(default: " + dataset.DEFAULT_VARIANT + ")") for group in sorted(tests.keys()): print(" Group:", group) for test_name, test_func in tests[group]: @@ -165,31 +171,38 @@ benchmarks = filter_benchmarks(generators, args.benchmarks) # Run all specified benchmarks. for dataset, tests in benchmarks: - log.init("Preparing", dataset.NAME + "/" + dataset.get_variant(), - "dataset") - dataset.prepare(cache.cache_directory("datasets", dataset.NAME, - dataset.get_variant())) + log.init("Preparing", dataset.NAME + "/" + dataset.get_variant(), "dataset") + dataset.prepare(cache.cache_directory("datasets", dataset.NAME, dataset.get_variant())) # Prepare runners and import the dataset. - memgraph = runners.Memgraph(args.memgraph_binary, args.temporary_directory, - not args.no_properties_on_edges) + memgraph = runners.Memgraph(args.memgraph_binary, args.temporary_directory, not args.no_properties_on_edges) client = runners.Client(args.client_binary, args.temporary_directory) memgraph.start_preparation() - ret = client.execute(file_path=dataset.get_file(), - num_workers=args.num_workers_for_import) + ret = client.execute(file_path=dataset.get_file(), num_workers=args.num_workers_for_import) usage = memgraph.stop() # Display import statistics. 
print() for row in ret: - print("Executed", row["count"], "queries in", row["duration"], - "seconds using", row["num_workers"], - "workers with a total throughput of", row["throughput"], - "queries/second.") + print( + "Executed", + row["count"], + "queries in", + row["duration"], + "seconds using", + row["num_workers"], + "workers with a total throughput of", + row["throughput"], + "queries/second.", + ) print() - print("The database used", usage["cpu"], - "seconds of CPU time and peaked at", - usage["memory"] / 1024 / 1024, "MiB of RAM.") + print( + "The database used", + usage["cpu"], + "seconds of CPU time and peaked at", + usage["memory"] / 1024 / 1024, + "MiB of RAM.", + ) # Save import results. import_key = [dataset.NAME, dataset.get_variant(), "__import__"] @@ -208,24 +221,26 @@ for dataset, tests in benchmarks: config_key = [dataset.NAME, dataset.get_variant(), group, test] cached_count = config.get_value(*config_key) if cached_count is None: - print("Determining the number of queries necessary for", - args.single_threaded_runtime_sec, - "seconds of single-threaded runtime...") + print( + "Determining the number of queries necessary for", + args.single_threaded_runtime_sec, + "seconds of single-threaded runtime...", + ) # First run to prime the query caches. memgraph.start_benchmark() client.execute(queries=get_queries(func, 1), num_workers=1) # Get a sense of the runtime. count = 1 while True: - ret = client.execute(queries=get_queries(func, count), - num_workers=1) + ret = client.execute(queries=get_queries(func, count), num_workers=1) duration = ret[0]["duration"] - should_execute = int(args.single_threaded_runtime_sec / - (duration / count)) - print("executed_queries={}, total_duration={}, " - "query_duration={}, estimated_count={}".format( - count, duration, duration / count, - should_execute)) + should_execute = int(args.single_threaded_runtime_sec / (duration / count)) + print( + "executed_queries={}, total_duration={}, " + "query_duration={}, estimated_count={}".format( + count, duration, duration / count, should_execute + ) + ) # We don't have to execute the next iteration when # `should_execute` becomes the same order of magnitude as # `count * 10`. @@ -235,45 +250,45 @@ for dataset, tests in benchmarks: else: count = count * 10 memgraph.stop() - config.set_value(*config_key, value={ - "count": count, - "duration": args.single_threaded_runtime_sec}) + config.set_value(*config_key, value={"count": count, "duration": args.single_threaded_runtime_sec}) else: - print("Using cached query count of", cached_count["count"], - "queries for", cached_count["duration"], - "seconds of single-threaded runtime.") - count = int(cached_count["count"] * - args.single_threaded_runtime_sec / - cached_count["duration"]) + print( + "Using cached query count of", + cached_count["count"], + "queries for", + cached_count["duration"], + "seconds of single-threaded runtime.", + ) + count = int(cached_count["count"] * args.single_threaded_runtime_sec / cached_count["duration"]) # Benchmark run. 
print("Sample query:", get_queries(func, 1)[0][0]) - print("Executing benchmark with", count, "queries that should " - "yield a single-threaded runtime of", - args.single_threaded_runtime_sec, "seconds.") - print("Queries are executed using", args.num_workers_for_benchmark, - "concurrent clients.") + print( + "Executing benchmark with", + count, + "queries that should " "yield a single-threaded runtime of", + args.single_threaded_runtime_sec, + "seconds.", + ) + print("Queries are executed using", args.num_workers_for_benchmark, "concurrent clients.") memgraph.start_benchmark() - ret = client.execute(queries=get_queries(func, count), - num_workers=args.num_workers_for_benchmark)[0] + ret = client.execute(queries=get_queries(func, count), num_workers=args.num_workers_for_benchmark)[0] usage = memgraph.stop() ret["database"] = usage # Output summary. print() - print("Executed", ret["count"], "queries in", - ret["duration"], "seconds.") + print("Executed", ret["count"], "queries in", ret["duration"], "seconds.") print("Queries have been retried", ret["retries"], "times.") - print("Database used {:.3f} seconds of CPU time.".format( - usage["cpu"])) - print("Database peaked at {:.3f} MiB of memory.".format( - usage["memory"] / 1024.0 / 1024.0)) - print("{:<31} {:>20} {:>20} {:>20}".format("Metadata:", "min", - "avg", "max")) + print("Database used {:.3f} seconds of CPU time.".format(usage["cpu"])) + print("Database peaked at {:.3f} MiB of memory.".format(usage["memory"] / 1024.0 / 1024.0)) + print("{:<31} {:>20} {:>20} {:>20}".format("Metadata:", "min", "avg", "max")) metadata = ret["metadata"] for key in sorted(metadata.keys()): - print("{name:>30}: {minimum:>20.06f} {average:>20.06f} " - "{maximum:>20.06f}".format(name=key, **metadata[key])) + print( + "{name:>30}: {minimum:>20.06f} {average:>20.06f} " + "{maximum:>20.06f}".format(name=key, **metadata[key]) + ) log.success("Throughput: {:02f} QPS".format(ret["throughput"])) # Save results. From 1f778ba5f3b9d146f50f8ffd5ef7bc7b62b18f39 Mon Sep 17 00:00:00 2001 From: jeremy Date: Tue, 1 Nov 2022 14:14:55 +0100 Subject: [PATCH 17/38] Add possibility to give extra tests arg to MGBench --- tests/mgbench/benchmark.py | 7 ++++++- tests/mgbench/runners.py | 34 +++++++++++++++++----------------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 498f04f44..b5f1abee0 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -26,6 +26,7 @@ import log import helpers import runners import importlib +import time def get_queries(gen, count): @@ -120,6 +121,7 @@ parser.add_argument( parser.add_argument("--no-properties-on-edges", action="store_true", help="disable properties on edges") parser.add_argument("--datasets", default="datasets", help="datasets to scan") parser.add_argument("--datasets-path", default=".", help="path to datasets to scan") +parser.add_argument("--test-system-args", default="") args = parser.parse_args() sys.path.append(args.datasets_path) @@ -175,9 +177,12 @@ for dataset, tests in benchmarks: dataset.prepare(cache.cache_directory("datasets", dataset.NAME, dataset.get_variant())) # Prepare runners and import the dataset. 
- memgraph = runners.Memgraph(args.memgraph_binary, args.temporary_directory, not args.no_properties_on_edges) + memgraph = runners.Memgraph( + args.memgraph_binary, args.temporary_directory, not args.no_properties_on_edges, args.test_system_args + ) client = runners.Client(args.client_binary, args.temporary_directory) memgraph.start_preparation() + time.sleep(5.0) # giving enough time to machine manager and all to start up ret = client.execute(file_path=dataset.get_file(), num_workers=args.num_workers_for_import) usage = memgraph.stop() diff --git a/tests/mgbench/runners.py b/tests/mgbench/runners.py index 891a7cddd..067a58006 100644 --- a/tests/mgbench/runners.py +++ b/tests/mgbench/runners.py @@ -40,8 +40,7 @@ def _convert_args_to_flags(*args, **kwargs): def _get_usage(pid): total_cpu = 0 with open("/proc/{}/stat".format(pid)) as f: - total_cpu = (sum(map(int, f.read().split(")")[1].split()[11:15])) / - os.sysconf(os.sysconf_names["SC_CLK_TCK"])) + total_cpu = sum(map(int, f.read().split(")")[1].split()[11:15])) / os.sysconf(os.sysconf_names["SC_CLK_TCK"]) peak_rss = 0 with open("/proc/{}/status".format(pid)) as f: for row in f: @@ -52,18 +51,17 @@ def _get_usage(pid): class Memgraph: - def __init__(self, memgraph_binary, temporary_dir, properties_on_edges): + def __init__(self, memgraph_binary, temporary_dir, properties_on_edges, extra_args): self._memgraph_binary = memgraph_binary self._directory = tempfile.TemporaryDirectory(dir=temporary_dir) self._properties_on_edges = properties_on_edges self._proc_mg = None + self._extra_args = extra_args atexit.register(self._cleanup) # Determine Memgraph version - ret = subprocess.run([memgraph_binary, "--version"], - stdout=subprocess.PIPE, check=True) - version = re.search(r"[0-9]+\.[0-9]+\.[0-9]+", - ret.stdout.decode("utf-8")).group(0) + ret = subprocess.run([memgraph_binary, "--version"], stdout=subprocess.PIPE, check=True) + version = re.search(r"[0-9]+\.[0-9]+\.[0-9]+", ret.stdout.decode("utf-8")).group(0) self._memgraph_version = tuple(map(int, version.split("."))) def __del__(self): @@ -79,8 +77,14 @@ class Memgraph: if self._memgraph_version >= (0, 50, 0): kwargs["storage_properties_on_edges"] = self._properties_on_edges else: - assert self._properties_on_edges, \ - "Older versions of Memgraph can't disable properties on edges!" + assert self._properties_on_edges, "Older versions of Memgraph can't disable properties on edges!" 
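The fixed `time.sleep(5.0)` above is a blunt way to wait for the machine manager to come up; the runner's own `wait_for_server(7687)` only covers the Bolt port after `_start`. A hedged alternative is to poll the port with a deadline instead of sleeping a hard-coded five seconds (a sketch, not what runners.py actually does):

import socket
import time

def wait_for_port(host="127.0.0.1", port=7687, timeout=30.0):
    # Retry until the port accepts TCP connections or the deadline passes.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1.0):
                return
        except OSError:
            time.sleep(0.1)
    raise TimeoutError("port {} not ready after {}s".format(port, timeout))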
+ + if self._extra_args != "": + args_list = self._extra_args.split(" ") + assert len(args_list) % 2 == 0 + for i in range(0, len(args_list) // 2): + kwargs[args_list[i]] = args_list[i + 1] + return _convert_args_to_flags(self._memgraph_binary, **kwargs) def _start(self, **kwargs): @@ -94,8 +98,7 @@ class Memgraph: raise Exception("The database process died prematurely!") wait_for_server(7687) ret = self._proc_mg.poll() - assert ret is None, "The database process died prematurely " \ - "({})!".format(ret) + assert ret is None, "The database process died prematurely " "({})!".format(ret) def _cleanup(self): if self._proc_mg is None: @@ -121,8 +124,7 @@ class Memgraph: def stop(self): ret, usage = self._cleanup() - assert ret == 0, "The database process exited with a non-zero " \ - "status ({})!".format(ret) + assert ret == 0, "The database process exited with a non-zero " "status ({})!".format(ret) return usage @@ -135,8 +137,7 @@ class Client: return _convert_args_to_flags(self._client_binary, **kwargs) def execute(self, queries=None, file_path=None, num_workers=1): - if (queries is None and file_path is None) or \ - (queries is not None and file_path is not None): + if (queries is None and file_path is None) or (queries is not None and file_path is not None): raise ValueError("Either queries or input_path must be specified!") # TODO: check `file_path.endswith(".json")` to support advanced @@ -151,8 +152,7 @@ class Client: json.dump(query, f) f.write("\n") - args = self._get_args(input=file_path, num_workers=num_workers, - queries_json=queries_json) + args = self._get_args(input=file_path, num_workers=num_workers, queries_json=queries_json) ret = subprocess.run(args, stdout=subprocess.PIPE, check=True) data = ret.stdout.decode("utf-8").strip().split("\n") return list(map(json.loads, data)) From 1148fe9aad61251a9f4a968d96aa54c3771e3a44 Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Wed, 2 Nov 2022 14:13:40 +0100 Subject: [PATCH 18/38] Update tests/mgbench/datasets.py Co-authored-by: Jure Bajic --- tests/mgbench/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 40a76b96f..3a5806629 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -349,7 +349,7 @@ class AccessControl(Dataset): def __init__(self, variant=None): super().__init__(variant) - self.next_value_idx = Dataset.get_size(self)["vertices"] + 1 + self.next_value_idx = self.get_size()["vertices"] + 1 def benchmark__create__vertex(self): self.next_value_idx += 1 From edeebf46ec8efced3fb1d5949289717532307a6e Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Wed, 2 Nov 2022 14:13:58 +0100 Subject: [PATCH 19/38] Update tests/mgbench/benchmark.py Co-authored-by: Jure Bajic --- tests/mgbench/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index b5f1abee0..6458447d0 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -271,7 +271,7 @@ for dataset, tests in benchmarks: print( "Executing benchmark with", count, - "queries that should " "yield a single-threaded runtime of", + "queries that should yield a single-threaded runtime of", args.single_threaded_runtime_sec, "seconds.", ) From 1d18f1197fcd14ef30ca9db92f8b9f7f8b8f102b Mon Sep 17 00:00:00 2001 From: Jeremy B <97525434+42jeremy@users.noreply.github.com> Date: Wed, 2 Nov 2022 14:14:16 +0100 
Subject: [PATCH 20/38] Update tests/mgbench/dataset_creator.py Co-authored-by: Jure Bajic --- tests/mgbench/dataset_creator.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 991135cdd..d2b4a67d9 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -67,18 +67,18 @@ def main(): assert percentage_of_permissions > 0.0 and percentage_of_permissions <= 1.0 assert filename != "" - f = open(filename, "w") + with open(filename, "w") as f: - f.write("MATCH (n) DETACH DELETE n;\n") + f.write("MATCH (n) DETACH DELETE n;\n") - # Create the indexes - f.write("CREATE INDEX ON :File;\n") - f.write("CREATE INDEX ON :Permission;\n") - f.write("CREATE INDEX ON :Identity;\n") - f.write("CREATE INDEX ON :File(platformId);\n") - f.write("CREATE INDEX ON :File(name);\n") - f.write("CREATE INDEX ON :Permission(name);\n") - f.write("CREATE INDEX ON :Identity(email);\n") + # Create the indexes + f.write("CREATE INDEX ON :File;\n") + f.write("CREATE INDEX ON :Permission;\n") + f.write("CREATE INDEX ON :Identity;\n") + f.write("CREATE INDEX ON :File(platformId);\n") + f.write("CREATE INDEX ON :File(name);\n") + f.write("CREATE INDEX ON :Permission(name);\n") + f.write("CREATE INDEX ON :Identity(email);\n") # Create extra index: in distributed, this will be the schema f.write("CREATE INDEX ON :File(uuid);\n") From e909e7d2d8cc64770ade67d8f92cb08c8150ad9c Mon Sep 17 00:00:00 2001 From: jeremy Date: Wed, 2 Nov 2022 14:18:04 +0100 Subject: [PATCH 21/38] Format --- tests/mgbench/dataset_creator.py | 75 +++++++++++++++----------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index d2b4a67d9..72b773593 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -68,52 +68,49 @@ def main(): assert filename != "" with open(filename, "w") as f: + f.write("MATCH (n) DETACH DELETE n;\n") - f.write("MATCH (n) DETACH DELETE n;\n") + # Create the indexes + f.write("CREATE INDEX ON :File;\n") + f.write("CREATE INDEX ON :Permission;\n") + f.write("CREATE INDEX ON :Identity;\n") + f.write("CREATE INDEX ON :File(platformId);\n") + f.write("CREATE INDEX ON :File(name);\n") + f.write("CREATE INDEX ON :Permission(name);\n") + f.write("CREATE INDEX ON :Identity(email);\n") - # Create the indexes - f.write("CREATE INDEX ON :File;\n") - f.write("CREATE INDEX ON :Permission;\n") - f.write("CREATE INDEX ON :Identity;\n") - f.write("CREATE INDEX ON :File(platformId);\n") - f.write("CREATE INDEX ON :File(name);\n") - f.write("CREATE INDEX ON :Permission(name);\n") - f.write("CREATE INDEX ON :Identity(email);\n") + # Create extra index: in distributed, this will be the schema + f.write("CREATE INDEX ON :File(uuid);\n") + f.write("CREATE INDEX ON :Permission(uuid);\n") + f.write("CREATE INDEX ON :Identity(uuid);\n") - # Create extra index: in distributed, this will be the schema - f.write("CREATE INDEX ON :File(uuid);\n") - f.write("CREATE INDEX ON :Permission(uuid);\n") - f.write("CREATE INDEX ON :Identity(uuid);\n") + uuid = 1 - uuid = 1 + # Create the nodes File + for index in range(0, number_of_files): + f.write(f'CREATE (:File {{uuid: {uuid}, platformId: "platform_id", name: "name_file_{uuid}"}});\n') + uuid += 1 - # Create the nodes File - for index in range(0, number_of_files): - f.write(f'CREATE (:File {{uuid: {uuid}, platformId: "platform_id", name: 
"name_file_{uuid}"}});\n') - uuid += 1 + identities = [] + # Create the nodes Identity + for index in range(0, number_of_identities): + f.write(f'CREATE (:Identity {{uuid: {uuid}, name: "mail_{uuid}@something.com"}});\n') + uuid += 1 - identities = [] - # Create the nodes Identity - for index in range(0, number_of_identities): - f.write(f'CREATE (:Identity {{uuid: {uuid}, name: "mail_{uuid}@something.com"}});\n') - uuid += 1 + for outer_index in range(0, number_of_files): + for inner_index in range(0, number_of_identities): + file_uuid = outer_index + 1 + identity_uuid = number_of_files + inner_index + 1 - for outer_index in range(0, number_of_files): - for inner_index in range(0, number_of_identities): - file_uuid = outer_index + 1 - identity_uuid = number_of_files + inner_index + 1 - - if random.random() <= percentage_of_permissions: - f.write(f'CREATE (:Permission {{uuid: {uuid}, name: "name_permission_{uuid}"}});\n') - f.write( - f"MATCH (permission:Permission {{uuid: {uuid}}}), (file:File {{uuid: {file_uuid}}}) CREATE (permission)-[e: IS_FOR_FILE]->(file);\n" - ) - f.write( - f"MATCH (permission:Permission {{uuid: {uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e: IS_FOR_IDENTITY]->(identity);\n" - ) - uuid += 1 - - f.close() + if random.random() <= percentage_of_permissions: + f.write(f'CREATE (:Permission {{uuid: {uuid}, name: "name_permission_{uuid}"}});\n') + f.write( + f"MATCH (permission:Permission {{uuid: {uuid}}}), (file:File {{uuid: {file_uuid}}}) CREATE (permission)-[e: IS_FOR_FILE]->(file);\n" + ) + f.write( + f"MATCH (permission:Permission {{uuid: {uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e: IS_FOR_IDENTITY]->(identity);\n" + ) + uuid += 1 if __name__ == "__main__": From 70dc19dfdb4e4ef7f7acd79f1aee6ef2942e15db Mon Sep 17 00:00:00 2001 From: jeremy Date: Thu, 3 Nov 2022 11:03:21 +0100 Subject: [PATCH 22/38] Mgbench: apply filtering on results from client --- tests/mgbench/runners.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/mgbench/runners.py b/tests/mgbench/runners.py index 067a58006..363595c69 100644 --- a/tests/mgbench/runners.py +++ b/tests/mgbench/runners.py @@ -155,4 +155,5 @@ class Client: args = self._get_args(input=file_path, num_workers=num_workers, queries_json=queries_json) ret = subprocess.run(args, stdout=subprocess.PIPE, check=True) data = ret.stdout.decode("utf-8").strip().split("\n") + data = [x for x in data if not x.startswith("[")] return list(map(json.loads, data)) From 14e3e725658aa046efe071d09cb3ffc870d97c29 Mon Sep 17 00:00:00 2001 From: jeremy Date: Fri, 4 Nov 2022 08:52:47 +0100 Subject: [PATCH 23/38] Correct badly written range --- tests/mgbench/runners.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/runners.py b/tests/mgbench/runners.py index 363595c69..2b69a811f 100644 --- a/tests/mgbench/runners.py +++ b/tests/mgbench/runners.py @@ -82,7 +82,7 @@ class Memgraph: if self._extra_args != "": args_list = self._extra_args.split(" ") assert len(args_list) % 2 == 0 - for i in range(0, len(args_list) // 2): + for i in range(0, len(args_list), 2): kwargs[args_list[i]] = args_list[i + 1] return _convert_args_to_flags(self._memgraph_binary, **kwargs) From e41073bc2ca83b768a58f251e3802841ba573a93 Mon Sep 17 00:00:00 2001 From: jeremy Date: Fri, 4 Nov 2022 09:17:09 +0100 Subject: [PATCH 24/38] Update script to need single argument for local dataset --- tests/mgbench/benchmark.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) 
diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 6458447d0..f6b0a19ca 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -27,6 +27,7 @@ import helpers import runners import importlib import time +import os def get_queries(gen, count): @@ -119,13 +120,15 @@ parser.add_argument( "--temporary-directory", default="/tmp", help="directory path where temporary data should " "be stored" ) parser.add_argument("--no-properties-on-edges", action="store_true", help="disable properties on edges") -parser.add_argument("--datasets", default="datasets", help="datasets to scan") parser.add_argument("--datasets-path", default=".", help="path to datasets to scan") parser.add_argument("--test-system-args", default="") args = parser.parse_args() -sys.path.append(args.datasets_path) -dataset_to_use = importlib.import_module(args.datasets) +head_tail = os.path.split(args.datasets_path) +path_without_dataset_name = head_tail[0] +dataset_name = head_tail[1] +sys.path.append(path_without_dataset_name) +dataset_to_use = importlib.import_module(dataset_name) # Detect available datasets. generators = {} From 9e72c7cb54e1186baa0a1464a1bdb5bc4f938a85 Mon Sep 17 00:00:00 2001 From: jeremy Date: Fri, 4 Nov 2022 15:57:26 +0100 Subject: [PATCH 25/38] Add extra safety check in case we call on dataset.py --- tests/mgbench/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index f6b0a19ca..619723caf 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -126,7 +126,7 @@ args = parser.parse_args() head_tail = os.path.split(args.datasets_path) path_without_dataset_name = head_tail[0] -dataset_name = head_tail[1] +dataset_name = head_tail[1].split(".")[0] sys.path.append(path_without_dataset_name) dataset_to_use = importlib.import_module(dataset_name) From d17970f6d9dba056e3f06022de3849d93667cab6 Mon Sep 17 00:00:00 2001 From: jeremy Date: Fri, 4 Nov 2022 16:04:45 +0100 Subject: [PATCH 26/38] Update default value for --datasets-path --- tests/mgbench/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 619723caf..44eb2290d 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -120,7 +120,7 @@ parser.add_argument( "--temporary-directory", default="/tmp", help="directory path where temporary data should " "be stored" ) parser.add_argument("--no-properties-on-edges", action="store_true", help="disable properties on edges") -parser.add_argument("--datasets-path", default=".", help="path to datasets to scan") +parser.add_argument("--datasets-path", default="datasets", help="path to datasets to scan") parser.add_argument("--test-system-args", default="") args = parser.parse_args() From 5273d319e2316aadb62b6754dec68c8b90e58d4b Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 09:53:29 +0100 Subject: [PATCH 27/38] Add split file for access control --- .../accesscontrol_large.shard_configuration | 36 +++++++++++++++++++ .../accesscontrol_medium.shard_configuration | 36 +++++++++++++++++++ .../accesscontrol_small.shard_configuration | 36 +++++++++++++++++++ 3 files changed, 108 insertions(+) create mode 100644 tests/mgbench/splitfiles/accesscontrol_large.shard_configuration create mode 100644 tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration create mode 100644 tests/mgbench/splitfiles/accesscontrol_small.shard_configuration diff --git
a/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration new file mode 100644 index 000000000..b7ce91b58 --- /dev/null +++ b/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration @@ -0,0 +1,36 @@ +4 +uuid +email +name +platformId +2 +IS_FOR_IDENTITY +IS_FOR_FILE│ +3 +File +1 +uuid +string +1 +[1] +Identity +1 +uuid +string +1 +[10001] +Permission +1 +uuid +string +10 +[20001] +[10020000] +[20002000] +[30002000] +[40002000] +[50002000] +[60002000] +[70002000] +[80002000] +[90002000] diff --git a/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration new file mode 100644 index 000000000..ff01a53d2 --- /dev/null +++ b/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration @@ -0,0 +1,36 @@ +4 +uuid +email +name +platformId +2 +IS_FOR_IDENTITY +IS_FOR_FILE│ +3 +File +1 +uuid +string +1 +[1] +Identity +1 +uuid +string +1 +[1001] +Permission +1 +uuid +string +10 +[2001] +[102000] +[202000] +[302000] +[402000] +[502000] +[602000] +[702000] +[802000] +[902000] diff --git a/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration new file mode 100644 index 000000000..101c40cca --- /dev/null +++ b/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration @@ -0,0 +1,36 @@ +4 +uuid +email +name +platformId +2 +IS_FOR_IDENTITY +IS_FOR_FILE│ +3 +File +1 +uuid +string +1 +[1] +Identity +1 +uuid +string +1 +[11] +Permission +1 +uuid +string +10 +[21] +[31] +[41] +[51] +[61] +[71] +[81] +[91] +[100] +[110] From baacc52a65fbb373bca79a390a7334abb0580113 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 09:54:14 +0100 Subject: [PATCH 28/38] Add support for split file configuration --- tests/mgbench/benchmark.py | 6 +++++- tests/mgbench/datasets.py | 15 +++++++++++++++ tests/mgbench/runners.py | 5 ++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 44eb2290d..40760f63e 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -181,7 +181,11 @@ for dataset, tests in benchmarks: # Prepare runners and import the dataset. 
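The shard_configuration files above use a count-prefixed, line-oriented layout: global property names, edge types, then one block per label with its primary-key property, key type, and the split points bounding each shard. The field meanings here are inferred from the file shape only; a heuristic reader under that assumption:

def read_split_file(path):
    # Assumed layout: N, N property names, M, M edge types, L, then per
    # label: name, K, K key properties, key type, S, S split points.
    with open(path) as f:
        lines = [line.strip() for line in f if line.strip()]
    it = iter(lines)
    properties = [next(it) for _ in range(int(next(it)))]
    edge_types = [next(it) for _ in range(int(next(it)))]
    labels = {}
    for _ in range(int(next(it))):
        name = next(it)
        keys = [next(it) for _ in range(int(next(it)))]
        key_type = next(it)
        splits = [next(it) for _ in range(int(next(it)))]
        labels[name] = {"keys": keys, "type": key_type, "splits": splits}
    return {"properties": properties, "edge_types": edge_types, "labels": labels}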
memgraph = runners.Memgraph( - args.memgraph_binary, args.temporary_directory, not args.no_properties_on_edges, args.test_system_args + args.memgraph_binary, + args.temporary_directory, + not args.no_properties_on_edges, + args.test_system_args, + dataset.get_split_file(), ) client = runners.Client(args.client_binary, args.temporary_directory) memgraph.start_preparation() diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 3a5806629..e73169aab 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -63,6 +63,10 @@ class Dataset: raise ValueError("The size defined for this variant doesn't " "have the number of vertices and/or edges!") self._num_vertices = self._size["vertices"] self._num_edges = self._size["edges"] + if self.SPLIT_FILES is not None: + self._split_file = self.SPLIT_FILES.get(variant, None) + else: + self._split_file = None def prepare(self, directory): if self._file is not None: @@ -92,6 +96,11 @@ class Dataset: """Returns number of vertices/edges for the current variant.""" return self._size + def get_split_file(self): + """Returns the location of the split file of the dataset.""" + assert self._split_file is not None + return self._split_file + # All tests should be query generator functions that output all of the # queries that should be executed by the runner. The functions should be # named `benchmark__GROUPNAME__TESTNAME` and should not accept any @@ -299,6 +308,12 @@ class AccessControl(Dataset): "medium": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_medium.setup.cypher.gz", "large": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_large.setup.cypher.gz", } + SPLIT_FILES = { + "empty_only_index": "splitfiles/accesscontrol_small.shard_configuration", + "small": "splitfiles/accesscontrol_small.shard_configuration", + "medium": "splitfiles/accesscontrol_medium.shard_configuration", + "large": "splitfiles/accesscontrol_large.shard_configuration", + } SIZES = { "empty_only_index": { "vertices": 0, diff --git a/tests/mgbench/runners.py b/tests/mgbench/runners.py index 2b69a811f..acee68e07 100644 --- a/tests/mgbench/runners.py +++ b/tests/mgbench/runners.py @@ -51,12 +51,13 @@ def _get_usage(pid): class Memgraph: - def __init__(self, memgraph_binary, temporary_dir, properties_on_edges, extra_args): + def __init__(self, memgraph_binary, temporary_dir, properties_on_edges, extra_args, split_file): self._memgraph_binary = memgraph_binary self._directory = tempfile.TemporaryDirectory(dir=temporary_dir) self._properties_on_edges = properties_on_edges self._proc_mg = None self._extra_args = extra_args + self._split_file = split_file atexit.register(self._cleanup) # Determine Memgraph version @@ -85,6 +86,8 @@ class Memgraph: for i in range(0, len(args_list), 2): kwargs[args_list[i]] = args_list[i + 1] + kwargs["split-file"] = self._split_file + return _convert_args_to_flags(self._memgraph_binary, **kwargs) def _start(self, **kwargs): From dca94f42bb543daaf25be0939a09dacfb61e2bb8 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 10:14:49 +0100 Subject: [PATCH 29/38] Update key type in shard configuration --- .../splitfiles/accesscontrol_large.shard_configuration | 6 +++--- .../splitfiles/accesscontrol_medium.shard_configuration | 6 +++--- .../splitfiles/accesscontrol_small.shard_configuration | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration 
b/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration index b7ce91b58..1f5759e0c 100644 --- a/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration +++ b/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration @@ -10,19 +10,19 @@ IS_FOR_FILE│ File 1 uuid -string +int 1 [1] Identity 1 uuid -string +int 1 [10001] Permission 1 uuid -string +int 10 [20001] [10020000] diff --git a/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration index ff01a53d2..f346d4b96 100644 --- a/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration +++ b/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration @@ -10,19 +10,19 @@ IS_FOR_FILE│ File 1 uuid -string +int 1 [1] Identity 1 uuid -string +int 1 [1001] Permission 1 uuid -string +int 10 [2001] [102000] diff --git a/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration index 101c40cca..86ea13346 100644 --- a/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration +++ b/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration @@ -10,19 +10,19 @@ IS_FOR_FILE│ File 1 uuid -string +int 1 [1] Identity 1 uuid -string +int 1 [11] Permission 1 uuid -string +int 10 [21] [31] From b10b1eb23982d9f72d6f25c74f7248f6de5c7889 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 10:33:24 +0100 Subject: [PATCH 30/38] Correct shard configuration --- .../mgbench/splitfiles/accesscontrol_large.shard_configuration | 2 +- .../mgbench/splitfiles/accesscontrol_medium.shard_configuration | 2 +- .../mgbench/splitfiles/accesscontrol_small.shard_configuration | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration index 1f5759e0c..34dca66be 100644 --- a/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration +++ b/tests/mgbench/splitfiles/accesscontrol_large.shard_configuration @@ -5,7 +5,7 @@ name platformId 2 IS_FOR_IDENTITY -IS_FOR_FILE│ +IS_FOR_FILE 3 File 1 diff --git a/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration index f346d4b96..a807e783f 100644 --- a/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration +++ b/tests/mgbench/splitfiles/accesscontrol_medium.shard_configuration @@ -5,7 +5,7 @@ name platformId 2 IS_FOR_IDENTITY -IS_FOR_FILE│ +IS_FOR_FILE 3 File 1 diff --git a/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration b/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration index 86ea13346..9c11b6258 100644 --- a/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration +++ b/tests/mgbench/splitfiles/accesscontrol_small.shard_configuration @@ -5,7 +5,7 @@ name platformId 2 IS_FOR_IDENTITY -IS_FOR_FILE│ +IS_FOR_FILE 3 File 1 From 5201db46d20aedf71b21ecd367e0227cf04c8bf4 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 12:15:57 +0100 Subject: [PATCH 31/38] Add assert for split_file --- tests/mgbench/datasets.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index e73169aab..8722ada1d 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -63,10 +63,9 @@ class Dataset: raise ValueError("The size defined for this variant doesn't " "have the number of vertices 
and/or edges!") self._num_vertices = self._size["vertices"] self._num_edges = self._size["edges"] - if self.SPLIT_FILES is not None: - self._split_file = self.SPLIT_FILES.get(variant, None) - else: - self._split_file = None + self._split_file = self.SPLIT_FILES.get(variant, None) + assert self._split_file is not None + assert self._split_file != "" def prepare(self, directory): if self._file is not None: From 2a7ed1ad829ad87075450de879a09c3bb2a4c33b Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 12:55:08 +0100 Subject: [PATCH 32/38] Add single e2e benchmark test --- .github/workflows/diff.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/diff.yaml b/.github/workflows/diff.yaml index ef5cf2ee2..550452b85 100644 --- a/.github/workflows/diff.yaml +++ b/.github/workflows/diff.yaml @@ -173,6 +173,15 @@ jobs: cd build ctest -R memgraph__simulation --output-on-failure -j$THREADS + - name: Run single benchmark test + run: | + # Activate toolchain. + source /opt/toolchain-v4/activate + + # Run simulation tests. + cd tests/mgbench + ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" + release_build: name: "Release build" runs-on: [self-hosted, Linux, X64, Diff] @@ -220,6 +229,15 @@ jobs: cd build ctest -R memgraph__simulation --output-on-failure -j$THREADS + - name: Run single benchmark test + run: | + # Activate toolchain. + source /opt/toolchain-v4/activate + + # Run simulation tests. + cd tests/mgbench + ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" + - name: Run e2e tests run: | # TODO(gitbuda): Setup mgclient and pymgclient properly. From c16f948de9d27ebeafa4821f1d94536faed5e1f3 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 13:08:35 +0100 Subject: [PATCH 33/38] Delete cache folder before running benchmark test --- .github/workflows/diff.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/diff.yaml b/.github/workflows/diff.yaml index 550452b85..731e9e816 100644 --- a/.github/workflows/diff.yaml +++ b/.github/workflows/diff.yaml @@ -180,6 +180,7 @@ jobs: # Run simulation tests. cd tests/mgbench + rm -r .cache ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" release_build: @@ -236,6 +237,7 @@ jobs: # Run simulation tests. cd tests/mgbench + rm -r .cache ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" - name: Run e2e tests From a54bcb9819bdded10e49ea3aefe3dbd1ba1f1b52 Mon Sep 17 00:00:00 2001 From: jeremy Date: Mon, 7 Nov 2022 17:32:09 +0100 Subject: [PATCH 34/38] Remove un-necessary rm in workflow --- .github/workflows/diff.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/diff.yaml b/.github/workflows/diff.yaml index 731e9e816..550452b85 100644 --- a/.github/workflows/diff.yaml +++ b/.github/workflows/diff.yaml @@ -180,7 +180,6 @@ jobs: # Run simulation tests. cd tests/mgbench - rm -r .cache ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" release_build: @@ -237,7 +236,6 @@ jobs: # Run simulation tests. 
cd tests/mgbench - rm -r .cache ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" - name: Run e2e tests From 61b9457718423f045309a730539ae44060e48842 Mon Sep 17 00:00:00 2001 From: jeremy Date: Tue, 8 Nov 2022 11:35:54 +0100 Subject: [PATCH 35/38] Remove split-files logic from test code --- .github/workflows/diff.yaml | 4 ++-- tests/mgbench/benchmark.py | 1 - tests/mgbench/datasets.py | 14 -------------- tests/mgbench/runners.py | 5 +---- 4 files changed, 3 insertions(+), 21 deletions(-) diff --git a/.github/workflows/diff.yaml b/.github/workflows/diff.yaml index 550452b85..41573fc67 100644 --- a/.github/workflows/diff.yaml +++ b/.github/workflows/diff.yaml @@ -180,7 +180,7 @@ jobs: # Run simulation tests. cd tests/mgbench - ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" + ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "split-file splitfiles/accesscontrol_small.shard_configuration bolt-num-workers 1" release_build: name: "Release build" @@ -236,7 +236,7 @@ jobs: # Run simulation tests. cd tests/mgbench - ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "bolt-num-workers 1" + ./benchmark.py accesscontrol/small --num-workers-for-import 1 --test-system-arg "split-file splitfiles/accesscontrol_small.shard_configuration bolt-num-workers 1" - name: Run e2e tests run: | diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 40760f63e..6f37c9570 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -185,7 +185,6 @@ for dataset, tests in benchmarks: args.temporary_directory, not args.no_properties_on_edges, args.test_system_args, - dataset.get_split_file(), ) client = runners.Client(args.client_binary, args.temporary_directory) memgraph.start_preparation() diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 8722ada1d..3a5806629 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -63,9 +63,6 @@ class Dataset: raise ValueError("The size defined for this variant doesn't " "have the number of vertices and/or edges!") self._num_vertices = self._size["vertices"] self._num_edges = self._size["edges"] - self._split_file = self.SPLIT_FILES.get(variant, None) - assert self._split_file is not None - assert self._split_file != "" def prepare(self, directory): if self._file is not None: @@ -95,11 +92,6 @@ class Dataset: """Returns number of vertices/edges for the current variant.""" return self._size - def get_split_file(self): - """Returns the location of the split file of the dataset.""" - assert self._split_file is not None - return self._split_file - # All tests should be query generator functions that output all of the # queries that should be executed by the runner. 
The functions should be # named `benchmark__GROUPNAME__TESTNAME` and should not accept any @@ -307,12 +299,6 @@ class AccessControl(Dataset): "medium": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_medium.setup.cypher.gz", "large": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/accesscontrol/accesscontrol_large.setup.cypher.gz", } - SPLIT_FILES = { - "empty_only_index": "splitfiles/accesscontrol_small.shard_configuration", - "small": "splitfiles/accesscontrol_small.shard_configuration", - "medium": "splitfiles/accesscontrol_medium.shard_configuration", - "large": "splitfiles/accesscontrol_large.shard_configuration", - } SIZES = { "empty_only_index": { "vertices": 0, diff --git a/tests/mgbench/runners.py b/tests/mgbench/runners.py index acee68e07..2b69a811f 100644 --- a/tests/mgbench/runners.py +++ b/tests/mgbench/runners.py @@ -51,13 +51,12 @@ def _get_usage(pid): class Memgraph: - def __init__(self, memgraph_binary, temporary_dir, properties_on_edges, extra_args, split_file): + def __init__(self, memgraph_binary, temporary_dir, properties_on_edges, extra_args): self._memgraph_binary = memgraph_binary self._directory = tempfile.TemporaryDirectory(dir=temporary_dir) self._properties_on_edges = properties_on_edges self._proc_mg = None self._extra_args = extra_args - self._split_file = split_file atexit.register(self._cleanup) # Determine Memgraph version @@ -86,8 +85,6 @@ class Memgraph: for i in range(0, len(args_list), 2): kwargs[args_list[i]] = args_list[i + 1] - kwargs["split-file"] = self._split_file - return _convert_args_to_flags(self._memgraph_binary, **kwargs) def _start(self, **kwargs): From 33add3ecd067d7beb948a0fdeda6817470efefd3 Mon Sep 17 00:00:00 2001 From: jeremy Date: Wed, 9 Nov 2022 15:38:51 +0100 Subject: [PATCH 36/38] Force formatting --- tests/mgbench/benchmark.py | 8 ++++---- tests/mgbench/dataset_creator.py | 5 +++-- tests/mgbench/helpers.py | 1 - 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/mgbench/benchmark.py b/tests/mgbench/benchmark.py index 6f37c9570..9d8423e89 100755 --- a/tests/mgbench/benchmark.py +++ b/tests/mgbench/benchmark.py @@ -15,19 +15,19 @@ import argparse import collections import copy import fnmatch +import importlib import inspect import json import multiprocessing +import os import random import sys +import time import datasets -import log import helpers +import log import runners -import importlib -import time -import os def get_queries(gen, count): diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 72b773593..9ebeb8cd1 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -9,9 +9,10 @@ # by the Apache License, Version 2.0, included in the file # licenses/APL.txt. 
-import random -import helpers import argparse +import random + +import helpers # Explaination of datasets: # - empty_only_index: contains index; contains no data diff --git a/tests/mgbench/helpers.py b/tests/mgbench/helpers.py index 1a4cd3c3e..b46e51db4 100644 --- a/tests/mgbench/helpers.py +++ b/tests/mgbench/helpers.py @@ -14,7 +14,6 @@ import json import os import subprocess - SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) From 968584a8fcf7ed1467c19f2c3ba808967c4c5b5c Mon Sep 17 00:00:00 2001 From: jeremy Date: Wed, 9 Nov 2022 16:02:25 +0100 Subject: [PATCH 37/38] Add comment force github workflow --- tests/mgbench/helpers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/mgbench/helpers.py b/tests/mgbench/helpers.py index b46e51db4..8a122b9e0 100644 --- a/tests/mgbench/helpers.py +++ b/tests/mgbench/helpers.py @@ -118,3 +118,6 @@ class Cache: def save_config(self, config): with open(self._config, "w") as f: json.dump(config.get_data(), f) + + +# Comment to force github workflow From 6df2db0d1911e16c22241f7cf24b13d780799637 Mon Sep 17 00:00:00 2001 From: jeremy Date: Wed, 9 Nov 2022 16:02:59 +0100 Subject: [PATCH 38/38] Remove comment force github workflow --- tests/mgbench/helpers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/mgbench/helpers.py b/tests/mgbench/helpers.py index 8a122b9e0..b46e51db4 100644 --- a/tests/mgbench/helpers.py +++ b/tests/mgbench/helpers.py @@ -118,6 +118,3 @@ class Cache: def save_config(self, config): with open(self._config, "w") as f: json.dump(config.get_data(), f) - - -# Comment to force github workflow
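After the final cleanup, benchmark.py ends up wiring the runner as summarized below. Note that the CI workflow passes `--test-system-arg`, which argparse accepts as an unambiguous abbreviation of `--test-system-args`. This is a hypothetical end-to-end sketch using the signatures from the diffs above; binary paths and argument values are examples only:

import runners

memgraph = runners.Memgraph(
    "memgraph",  # --memgraph-binary (example path)
    "/tmp",      # --temporary-directory
    True,        # properties on edges enabled
    "split-file splitfiles/accesscontrol_small.shard_configuration bolt-num-workers 1",
)
client = runners.Client("client", "/tmp")
memgraph.start_preparation()
ret = client.execute(file_path="dataset.cypher", num_workers=1)
usage = memgraph.stop()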