diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py index 432a44ec7..754419ccc 100644 --- a/tests/mgbench/dataset_creator.py +++ b/tests/mgbench/dataset_creator.py @@ -10,8 +10,8 @@ # licenses/APL.txt. import random - import helpers +import argparse # Explaination of datasets: # - empty_only_index: contains index; contains no data @@ -22,73 +22,95 @@ import helpers # ┌──────────────┐ # │ Permission │ # ┌────────────────┐ │ Schema:uuid │ ┌────────────┐ -# │:IS_FOR_IDENTITY├────┤ Index:name ├───┤:IS_FOR_USER│ +# │:IS_FOR_IDENTITY├────┤ Index:name ├───┤:IS_FOR_FILE│ # └┬───────────────┘ └──────────────┘ └────────────┤ # │ │ -# ┌──────▼──────────────┐ ┌──▼───────────┐ -# │ Identity │ │ User │ -# │ Schema:uuid │ │ Schema:uuid │ -# │ Index:platformId │ │ Index:email │ -# │ Index:name │ └──────────────┘ -# └─────────────────────┘ +# ┌──────▼──────────────┐ ┌──▼────────────────┐ +# │ Identity │ │ File │ +# │ Schema:uuid │ │ Schema:uuid │ +# │ Index:email │ │ Index:name │ +# └─────────────────────┘ │ Index:platformId │ +# └───────────────────┘ # -# -# - User: attributes: ["uuid", "name", "platformId"] +# - File: attributes: ["uuid", "name", "platformId"] # - Permission: attributes: ["uuid", "name"] # - Identity: attributes: ["uuid", "email"] # # Indexes: -# - User: [User(uuid), User(platformId), User(name)] +# - File: [File(uuid), File(platformId), File(name)] # - Permission: [Permission(uuid), Permission(name)] # - Identity: [Identity(uuid), Identity(email)] # # Edges: -# - (:Permission)-[:IS_FOR_USER]->(:User) +# - (:Permission)-[:IS_FOR_FILE]->(:File) # - (:Permission)-[:IS_FOR_IDENTITYR]->(:Identity) # -# Distributed specific: uuid is the schema +# AccessControl specific: uuid is the schema -filename = "dataset.cypher" -f = open(filename, "x") +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--number_of_identities", type=int, default=10) + parser.add_argument("--number_of_files", type=int, default=10) + parser.add_argument("--percentage_of_permissions", type=float, default=1.0) + parser.add_argument("--filename", default="dataset.cypher") -f.write("MATCH (n) DETACH DELETE n;\n") + args = parser.parse_args() -# Create the indexes -f.write("CREATE INDEX ON :User;\n") -f.write("CREATE INDEX ON :Permission;\n") -f.write("CREATE INDEX ON :Identity;\n") -f.write("CREATE INDEX ON :User(platformId);\n") -f.write("CREATE INDEX ON :User(name);\n") -f.write("CREATE INDEX ON :Permission(name);\n") -f.write("CREATE INDEX ON :Identity(email);\n") + number_of_identities = args.number_of_identities + number_of_files = args.number_of_files + percentage_of_permissions = args.percentage_of_permissions + filename = args.filename -# Create extra index: in distributed, this will be the schema -f.write("CREATE INDEX ON :User(uuid);\n") -f.write("CREATE INDEX ON :Permission(uuid);\n") -f.write("CREATE INDEX ON :Identity(uuid);\n") + assert number_of_identities > 0 + assert number_of_files > 0 + assert percentage_of_permissions > 0.0 and percentage_of_permissions <= 1.0 + assert filename != "" -platform_ids = [f"somePlatformId_{id}" for id in range(10)] + f = open(filename, "w") -# This is the number of clusters to change if you want a bigger dataset -number_of_clusters = 3000000 + f.write("MATCH (n) DETACH DELETE n;\n") -for index in range(1, number_of_clusters + 1): - platform_id = platform_ids[random.randint(0, len(platform_ids) - 1)] - user_uuid = index - platform_uuid = number_of_clusters + index - identity_uuid = 2 * number_of_clusters + index + # Create the indexes + f.write("CREATE INDEX ON :File;\n") + f.write("CREATE INDEX ON :Permission;\n") + f.write("CREATE INDEX ON :Identity;\n") + f.write("CREATE INDEX ON :File(platformId);\n") + f.write("CREATE INDEX ON :File(name);\n") + f.write("CREATE INDEX ON :Permission(name);\n") + f.write("CREATE INDEX ON :Identity(email);\n") - # Create the nodes - f.write(f'CREATE (:User {{uuid: {user_uuid}, platformId: "{platform_id}", name: "name_user_{user_uuid}"}});\n') - f.write(f'CREATE (:Permission {{uuid: {platform_uuid}, name: "name_permission_{platform_uuid}"}});\n') - f.write(f'CREATE (:Permission {{uuid: {identity_uuid}, name: "mail_{identity_uuid}@something.com"}});\n') + # Create extra index: in distributed, this will be the schema + f.write("CREATE INDEX ON :File(uuid);\n") + f.write("CREATE INDEX ON :Permission(uuid);\n") + f.write("CREATE INDEX ON :Identity(uuid);\n") - # Create the edges - f.write( - f"MATCH (permission:Permission {{uuid: {platform_uuid}}}), (user:User {{uuid: {user_uuid}}}) CREATE (permission)-[e: IS_FOR_USER]->(user);\n" - ) - f.write( - f"MATCH (permission:Permission {{uuid: {platform_uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e: IS_FOR_IDENTITY]->(identity);\n" - ) + uuid = 1 -f.close() + files = [] + # Create the nodes File + for index in range(0, number_of_files): + f.write(f'CREATE (:File {{uuid: {uuid}, platformId: platform_id, name: "name_file_{uuid}"}});\n') + uuid += 1 + + identities = [] + # Create the nodes Identity + for index in range(0, number_of_identities): + f.write(f'CREATE (:Identity {{uuid: {uuid}, name: "mail_{uuid}@something.com"}});\n') + uuid += 1 + + for outer_index in range(0, number_of_files): + for inner_index in range(0, number_of_identities): + file_uuid = outer_index + identity_uuid = number_of_files + inner_index + + if random.random() <= percentage_of_permissions: + f.write(f'CREATE (:Permission {{uuid: {uuid}, name: "name_permission_{uuid}"}});\n') + f.write( + f"MATCH (permission:Permission {{uuid: {uuid}}}), (file:File {{uuid: {file_uuid}}}) CREATE (permission)-[e: IS_FOR_FILE]->(file);\n" + ) + f.write( + f"MATCH (permission:Permission {{uuid: {uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e: IS_FOR_IDENTITY]->(identity);\n" + ) + uuid += 1 + + f.close() diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py index 45fdf67db..4c5f19043 100644 --- a/tests/mgbench/datasets.py +++ b/tests/mgbench/datasets.py @@ -282,7 +282,7 @@ class Pokec(Dataset): return ("MATCH (n {id: $id}) RETURN n", {"id": self._get_random_vertex()}) -class Distributed(Dataset): +class AccessControl(Dataset): # Explaination of datasets: # - empty_only_index: contains index; contains no data @@ -290,21 +290,21 @@ class Distributed(Dataset): # # See dataset_creator.py to understand the datamodel and generate a dataset - NAME = "distributed" + NAME = "accesscontrol" VARIANTS = ["empty_only_index", "small", "medium", "large"] DEFAULT_VARIANT = "empty_only_index" URLS = { - "empty_only_index": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_empty_only_index.setup.cypher.gz", - "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_small.setup.cypher.gz", - "medium": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_medium.setup.cypher.gz", - "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_large.setup.cypher.gz", + "empty_only_index": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_empty_only_index.setup.cypher.gz", + "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_small.setup.cypher.gz", + "medium": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_medium.setup.cypher.gz", + "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/accesscontrol_large.setup.cypher.gz", } SIZES = { "empty_only_index": { "vertices": 0, "edges": -1, # not used "uuid_ranges": { - "User": {"first_uuid": 0, "last_uuid": 0}, + "File": {"first_uuid": 0, "last_uuid": 0}, "Permission": {"first_uuid": 0, "last_uuid": 0}, "Identity": {"first_uuid": 0, "last_uuid": 0}, }, @@ -313,7 +313,7 @@ class Distributed(Dataset): "vertices": 30, "edges": -1, # not used "uuid_ranges": { - "User": {"first_uuid": 1, "last_uuid": 10}, + "File": {"first_uuid": 1, "last_uuid": 10}, "Permission": {"first_uuid": 11, "last_uuid": 20}, "Identity": {"first_uuid": 21, "last_uuid": 30}, }, @@ -322,7 +322,7 @@ class Distributed(Dataset): "vertices": 30000, "edges": -1, # not used "uuid_ranges": { - "User": {"first_uuid": 1, "last_uuid": 10000}, + "File": {"first_uuid": 1, "last_uuid": 10000}, "Permission": {"first_uuid": 10001, "last_uuid": 20000}, "Identity": {"first_uuid": 10001, "last_uuid": 30000}, }, @@ -331,7 +331,7 @@ class Distributed(Dataset): "vertices": 3000000, "edges": -1, # not used "uuid_ranges": { - "User": {"first_uuid": 1, "last_uuid": 1000000}, + "File": {"first_uuid": 1, "last_uuid": 1000000}, "Permission": {"first_uuid": 100001, "last_uuid": 2000000}, "Identity": {"first_uuid": 1000001, "last_uuid": 3000000}, }, @@ -339,7 +339,7 @@ class Distributed(Dataset): } def _get_random_uuid(self, type): - assert type in ["User", "Permission", "Identity"] + assert type in ["File", "Permission", "Identity"] first_uuid = Dataset.get_size(self)["uuid_ranges"][type]["first_uuid"] last_uuid = Dataset.get_size(self)["uuid_ranges"][type]["last_uuid"] @@ -353,17 +353,17 @@ class Distributed(Dataset): def benchmark__create__vertex(self): self.next_value_idx += 1 - query = (f"CREATE (:User {{uuid: {self.next_value_idx}}});", {}) + query = (f"CREATE (:File {{uuid: {self.next_value_idx}}});", {}) return query def benchmark__create__edges(self): permission_uuid = self._get_random_uuid("Permission") - user_uuid = self._get_random_uuid("User") + file_uuid = self._get_random_uuid("File") query = ( - "MATCH (permission:Permission {uuid: $permission_uuid}), (user:User {uuid: $user_uuid}) " - "CREATE (permission)-[:IS_FOR_USER]->(user)", - {"permission_uuid": permission_uuid, "user_uuid": user_uuid}, + "MATCH (permission:Permission {uuid: $permission_uuid}), (file:File {uuid: $file_uuid}) " + "CREATE (permission)-[:IS_FOR_FILE]->(file)", + {"permission_uuid": permission_uuid, "file_uuid": file_uuid}, ) return query @@ -375,10 +375,10 @@ class Distributed(Dataset): def benchmark__match__match_on_labelled_vertices(self): self.next_value_idx += 1 - query = ("MATCH (n:User) RETURN *", {}) + query = ("MATCH (n:File) RETURN *", {}) return query def benchmark__match__match_all_verteices_with_edges(self): self.next_value_idx += 1 - query = ("MATCH (permission:Permission)-[e:IS_FOR_USER]->(user:User) RETURN *", {}) + query = ("MATCH (permission:Permission)-[e:IS_FOR_FILE]->(file:File) RETURN *", {}) return query