From a5dc818e19f314f6e0990fd89913c0f8911cce66 Mon Sep 17 00:00:00 2001
From: jeremy
Date: Wed, 12 Oct 2022 16:35:47 +0200
Subject: [PATCH] Add new dataset for mgbench

---
 tests/mgbench/dataset_creator.py |  94 +++++++++++
 tests/mgbench/datasets.py        | 272 +++++++++++++++++++++----------
 2 files changed, 277 insertions(+), 89 deletions(-)
 create mode 100644 tests/mgbench/dataset_creator.py

diff --git a/tests/mgbench/dataset_creator.py b/tests/mgbench/dataset_creator.py
new file mode 100644
index 000000000..432a44ec7
--- /dev/null
+++ b/tests/mgbench/dataset_creator.py
@@ -0,0 +1,94 @@
+# Copyright 2021 Memgraph Ltd.
+#
+# Use of this software is governed by the Business Source License
+# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
+# License, and you may not use this file except in compliance with the Business Source License.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0, included in the file
+# licenses/APL.txt.
+
+import random
+
+import helpers
+
+# Explanation of the datasets:
+# - empty_only_index: contains the indexes but no data
+# - small: contains the indexes and a small dataset
+#
+# The data model is as follows:
+#
+#                         ┌──────────────┐
+#                         │  Permission  │
+# ┌────────────────┐      │  Schema:uuid │      ┌────────────┐
+# │:IS_FOR_IDENTITY├──────┤  Index:name  ├──────┤:IS_FOR_USER│
+# └┬───────────────┘      └──────────────┘      └───────────┬┘
+#  │                                                        │
+# ┌▼──────────────┐                           ┌─────────────▼─────┐
+# │    Identity   │                           │        User       │
+# │  Schema:uuid  │                           │    Schema:uuid    │
+# │  Index:email  │                           │  Index:platformId │
+# └───────────────┘                           │    Index:name     │
+#                                             └───────────────────┘
+#
+#
+# - User: attributes: ["uuid", "name", "platformId"]
+# - Permission: attributes: ["uuid", "name"]
+# - Identity: attributes: ["uuid", "email"]
+#
+# Indexes:
+# - User: [User(uuid), User(platformId), User(name)]
+# - Permission: [Permission(uuid), Permission(name)]
+# - Identity: [Identity(uuid), Identity(email)]
+#
+# Edges:
+# - (:Permission)-[:IS_FOR_USER]->(:User)
+# - (:Permission)-[:IS_FOR_IDENTITY]->(:Identity)
+#
+# Distributed-specific: uuid is used as the schema (primary key).
+
+filename = "dataset.cypher"
+f = open(filename, "x")
+
+f.write("MATCH (n) DETACH DELETE n;\n")
+
+# Create the indexes
+f.write("CREATE INDEX ON :User;\n")
+f.write("CREATE INDEX ON :Permission;\n")
+f.write("CREATE INDEX ON :Identity;\n")
+f.write("CREATE INDEX ON :User(platformId);\n")
+f.write("CREATE INDEX ON :User(name);\n")
+f.write("CREATE INDEX ON :Permission(name);\n")
+f.write("CREATE INDEX ON :Identity(email);\n")
+
+# Create extra indexes: in distributed, these will be the schema
+f.write("CREATE INDEX ON :User(uuid);\n")
+f.write("CREATE INDEX ON :Permission(uuid);\n")
+f.write("CREATE INDEX ON :Identity(uuid);\n")
+
+platform_ids = [f"somePlatformId_{id}" for id in range(10)]
+
+# Increase number_of_clusters to generate a bigger dataset: each cluster consists of one User, one Permission and one Identity node plus the two edges connecting them.
+number_of_clusters = 3000000
+
+for index in range(1, number_of_clusters + 1):
+    platform_id = random.choice(platform_ids)
+    user_uuid = index
+    permission_uuid = number_of_clusters + index
+    identity_uuid = 2 * number_of_clusters + index
+
+    # Create the nodes
+    f.write(f'CREATE (:User {{uuid: {user_uuid}, platformId: "{platform_id}", name: "name_user_{user_uuid}"}});\n')
+    f.write(f'CREATE (:Permission {{uuid: {permission_uuid}, name: "name_permission_{permission_uuid}"}});\n')
+    f.write(f'CREATE (:Identity {{uuid: {identity_uuid}, email: "mail_{identity_uuid}@something.com"}});\n')
+
+    # Create the edges
+    f.write(
+        f"MATCH (permission:Permission {{uuid: {permission_uuid}}}), (user:User {{uuid: {user_uuid}}}) CREATE (permission)-[e:IS_FOR_USER]->(user);\n"
+    )
+    f.write(
+        f"MATCH (permission:Permission {{uuid: {permission_uuid}}}), (identity:Identity {{uuid: {identity_uuid}}}) CREATE (permission)-[e:IS_FOR_IDENTITY]->(identity);\n"
+    )
+
+f.close()
diff --git a/tests/mgbench/datasets.py b/tests/mgbench/datasets.py
index dbaaa2de9..45fdf67db 100644
--- a/tests/mgbench/datasets.py
+++ b/tests/mgbench/datasets.py
@@ -45,13 +45,10 @@ class Dataset:
             variant = self.DEFAULT_VARIANT
         if variant not in self.VARIANTS:
             raise ValueError("Invalid test variant!")
-        if (self.FILES and variant not in self.FILES) and \
-           (self.URLS and variant not in self.URLS):
-            raise ValueError("The variant doesn't have a defined URL or "
-                             "file path!")
+        if (self.FILES and variant not in self.FILES) and (self.URLS and variant not in self.URLS):
+            raise ValueError("The variant doesn't have a defined URL or " "file path!")
         if variant not in self.SIZES:
-            raise ValueError("The variant doesn't have a defined dataset "
-                             "size!")
+            raise ValueError("The variant doesn't have a defined dataset " "size!")
         self._variant = variant
         if self.FILES is not None:
             self._file = self.FILES.get(variant, None)
@@ -63,8 +60,7 @@ class Dataset:
             self._url = None
         self._size = self.SIZES[variant]
         if "vertices" not in self._size or "edges" not in self._size:
-            raise ValueError("The size defined for this variant doesn't "
-                             "have the number of vertices and/or edges!")
+            raise ValueError("The size defined for this variant doesn't " "have the number of vertices and/or edges!")
         self._num_vertices = self._size["vertices"]
         self._num_edges = self._size["edges"]
 
@@ -76,8 +72,7 @@ class Dataset:
         cached_input, exists = directory.get_file("dataset.cypher")
         if not exists:
             print("Downloading dataset file:", self._url)
-            downloaded_file = helpers.download_file(
-                self._url, directory.get_path())
+            downloaded_file = helpers.download_file(self._url, directory.get_path())
             print("Unpacking and caching file:", downloaded_file)
             helpers.unpack_and_move_file(downloaded_file, cached_input)
         print("Using cached dataset file:", cached_input)
@@ -137,18 +132,17 @@ class Pokec(Dataset):
 
     # Arango benchmarks
 
     def benchmark__arango__single_vertex_read(self):
-        return ("MATCH (n:User {id : $id}) RETURN n",
-                {"id": self._get_random_vertex()})
+        return ("MATCH (n:User {id : $id}) RETURN n", {"id": self._get_random_vertex()})
 
     def benchmark__arango__single_vertex_write(self):
-        return ("CREATE (n:UserTemp {id : $id}) RETURN n",
-                {"id": random.randint(1, self._num_vertices * 10)})
+        return ("CREATE (n:UserTemp {id : $id}) RETURN n", {"id": random.randint(1, self._num_vertices * 10)})
 
     def benchmark__arango__single_edge_write(self):
         vertex_from, vertex_to = self._get_random_from_to()
-        return ("MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m "
-                "CREATE (n)-[e:Temp]->(m) RETURN e",
-                {"from": vertex_from, "to": vertex_to})
+        return (
+            "MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " "CREATE (n)-[e:Temp]->(m) RETURN e",
+            {"from": vertex_from, "to": vertex_to},
+        )
 
     def benchmark__arango__aggregate(self):
         return ("MATCH (n:User) RETURN n.age, COUNT(*)", {})
@@ -157,92 +151,94 @@ class Pokec(Dataset):
         return ("MATCH (n:User) WHERE n.age >= 18 RETURN n.age, COUNT(*)", {})
 
     def benchmark__arango__expansion_1(self):
-        return ("MATCH (s:User {id: $id})-->(n:User) "
-                
"RETURN n.id", - {"id": self._get_random_vertex()}) + return ("MATCH (s:User {id: $id})-->(n:User) " "RETURN n.id", {"id": self._get_random_vertex()}) def benchmark__arango__expansion_1_with_filter(self): - return ("MATCH (s:User {id: $id})-->(n:User) " - "WHERE n.age >= 18 " - "RETURN n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->(n:User) " "WHERE n.age >= 18 " "RETURN n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_2(self): - return ("MATCH (s:User {id: $id})-->()-->(n:User) " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ("MATCH (s:User {id: $id})-->()-->(n:User) " "RETURN DISTINCT n.id", {"id": self._get_random_vertex()}) def benchmark__arango__expansion_2_with_filter(self): - return ("MATCH (s:User {id: $id})-->()-->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->(n:User) " "WHERE n.age >= 18 " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_3(self): - return ("MATCH (s:User {id: $id})-->()-->()-->(n:User) " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->()-->(n:User) " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_3_with_filter(self): - return ("MATCH (s:User {id: $id})-->()-->()-->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->()-->(n:User) " "WHERE n.age >= 18 " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_4(self): - return ("MATCH (s:User {id: $id})-->()-->()-->()-->(n:User) " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->()-->()-->(n:User) " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__expansion_4_with_filter(self): - return ("MATCH (s:User {id: $id})-->()-->()-->()-->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-->()-->()-->()-->(n:User) " "WHERE n.age >= 18 " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__neighbours_2(self): - return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " "RETURN DISTINCT n.id", {"id": self._get_random_vertex()}) def benchmark__arango__neighbours_2_with_filter(self): - return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-[*1..2]->(n:User) " "WHERE n.age >= 18 " "RETURN DISTINCT n.id", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__neighbours_2_with_data(self): - return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " - "RETURN DISTINCT n.id, n", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-[*1..2]->(n:User) " "RETURN DISTINCT n.id, n", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__neighbours_2_with_data_and_filter(self): - return ("MATCH (s:User {id: $id})-[*1..2]->(n:User) " - "WHERE n.age >= 18 " - "RETURN DISTINCT n.id, n", - {"id": self._get_random_vertex()}) + return ( + "MATCH (s:User {id: $id})-[*1..2]->(n:User) " 
"WHERE n.age >= 18 " "RETURN DISTINCT n.id, n", + {"id": self._get_random_vertex()}, + ) def benchmark__arango__shortest_path(self): vertex_from, vertex_to = self._get_random_from_to() - return ("MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " - "MATCH p=(n)-[*bfs..15]->(m) " - "RETURN extract(n in nodes(p) | n.id) AS path", - {"from": vertex_from, "to": vertex_to}) + return ( + "MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " + "MATCH p=(n)-[*bfs..15]->(m) " + "RETURN extract(n in nodes(p) | n.id) AS path", + {"from": vertex_from, "to": vertex_to}, + ) def benchmark__arango__shortest_path_with_filter(self): vertex_from, vertex_to = self._get_random_from_to() - return ("MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " - "MATCH p=(n)-[*bfs..15 (e, n | n.age >= 18)]->(m) " - "RETURN extract(n in nodes(p) | n.id) AS path", - {"from": vertex_from, "to": vertex_to}) + return ( + "MATCH (n:User {id: $from}), (m:User {id: $to}) WITH n, m " + "MATCH p=(n)-[*bfs..15 (e, n | n.age >= 18)]->(m) " + "RETURN extract(n in nodes(p) | n.id) AS path", + {"from": vertex_from, "to": vertex_to}, + ) # Our benchmark queries def benchmark__create__edge(self): vertex_from, vertex_to = self._get_random_from_to() - return ("MATCH (a:User {id: $from}), (b:User {id: $to}) " - "CREATE (a)-[:TempEdge]->(b)", - {"from": vertex_from, "to": vertex_to}) + return ( + "MATCH (a:User {id: $from}), (b:User {id: $to}) " "CREATE (a)-[:TempEdge]->(b)", + {"from": vertex_from, "to": vertex_to}, + ) def benchmark__create__pattern(self): return ("CREATE ()-[:TempEdge]->()", {}) @@ -251,9 +247,12 @@ class Pokec(Dataset): return ("CREATE ()", {}) def benchmark__create__vertex_big(self): - return ("CREATE (:L1:L2:L3:L4:L5:L6:L7 {p1: true, p2: 42, " - "p3: \"Here is some text that is not extremely short\", " - "p4:\"Short text\", p5: 234.434, p6: 11.11, p7: false})", {}) + return ( + "CREATE (:L1:L2:L3:L4:L5:L6:L7 {p1: true, p2: 42, " + 'p3: "Here is some text that is not extremely short", ' + 'p4:"Short text", p5: 234.434, p6: 11.11, p7: false})', + {}, + ) def benchmark__aggregation__count(self): return ("MATCH (n) RETURN count(n), count(n.age)", {}) @@ -262,29 +261,124 @@ class Pokec(Dataset): return ("MATCH (n) RETURN min(n.age), max(n.age), avg(n.age)", {}) def benchmark__match__pattern_cycle(self): - return ("MATCH (n:User {id: $id})-[e1]->(m)-[e2]->(n) " - "RETURN e1, m, e2", - {"id": self._get_random_vertex()}) + return ("MATCH (n:User {id: $id})-[e1]->(m)-[e2]->(n) " "RETURN e1, m, e2", {"id": self._get_random_vertex()}) def benchmark__match__pattern_long(self): - return ("MATCH (n1:User {id: $id})-[e1]->(n2)-[e2]->" - "(n3)-[e3]->(n4)<-[e4]-(n5) " - "RETURN n5 LIMIT 1", - {"id": self._get_random_vertex()}) + return ( + "MATCH (n1:User {id: $id})-[e1]->(n2)-[e2]->" "(n3)-[e3]->(n4)<-[e4]-(n5) " "RETURN n5 LIMIT 1", + {"id": self._get_random_vertex()}, + ) def benchmark__match__pattern_short(self): - return ("MATCH (n:User {id: $id})-[e]->(m) " - "RETURN m LIMIT 1", - {"id": self._get_random_vertex()}) + return ("MATCH (n:User {id: $id})-[e]->(m) " "RETURN m LIMIT 1", {"id": self._get_random_vertex()}) def benchmark__match__vertex_on_label_property(self): - return ("MATCH (n:User) WITH n WHERE n.id = $id RETURN n", - {"id": self._get_random_vertex()}) + return ("MATCH (n:User) WITH n WHERE n.id = $id RETURN n", {"id": self._get_random_vertex()}) def benchmark__match__vertex_on_label_property_index(self): - return ("MATCH (n:User {id: $id}) RETURN n", - {"id": self._get_random_vertex()}) + return 
("MATCH (n:User {id: $id}) RETURN n", {"id": self._get_random_vertex()}) def benchmark__match__vertex_on_property(self): - return ("MATCH (n {id: $id}) RETURN n", - {"id": self._get_random_vertex()}) + return ("MATCH (n {id: $id}) RETURN n", {"id": self._get_random_vertex()}) + + +class Distributed(Dataset): + + # Explaination of datasets: + # - empty_only_index: contains index; contains no data + # - small/medium/large: contains index; contains data (respectively small/medium/large dataset) + # + # See dataset_creator.py to understand the datamodel and generate a dataset + + NAME = "distributed" + VARIANTS = ["empty_only_index", "small", "medium", "large"] + DEFAULT_VARIANT = "empty_only_index" + URLS = { + "empty_only_index": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_empty_only_index.setup.cypher.gz", + "small": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_small.setup.cypher.gz", + "medium": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_medium.setup.cypher.gz", + "large": "https://s3-eu-west-1.amazonaws.com/deps.memgraph.io/distributed_large.setup.cypher.gz", + } + SIZES = { + "empty_only_index": { + "vertices": 0, + "edges": -1, # not used + "uuid_ranges": { + "User": {"first_uuid": 0, "last_uuid": 0}, + "Permission": {"first_uuid": 0, "last_uuid": 0}, + "Identity": {"first_uuid": 0, "last_uuid": 0}, + }, + }, + "small": { + "vertices": 30, + "edges": -1, # not used + "uuid_ranges": { + "User": {"first_uuid": 1, "last_uuid": 10}, + "Permission": {"first_uuid": 11, "last_uuid": 20}, + "Identity": {"first_uuid": 21, "last_uuid": 30}, + }, + }, + "medium": { + "vertices": 30000, + "edges": -1, # not used + "uuid_ranges": { + "User": {"first_uuid": 1, "last_uuid": 10000}, + "Permission": {"first_uuid": 10001, "last_uuid": 20000}, + "Identity": {"first_uuid": 10001, "last_uuid": 30000}, + }, + }, + "large": { + "vertices": 3000000, + "edges": -1, # not used + "uuid_ranges": { + "User": {"first_uuid": 1, "last_uuid": 1000000}, + "Permission": {"first_uuid": 100001, "last_uuid": 2000000}, + "Identity": {"first_uuid": 1000001, "last_uuid": 3000000}, + }, + }, + } + + def _get_random_uuid(self, type): + assert type in ["User", "Permission", "Identity"] + + first_uuid = Dataset.get_size(self)["uuid_ranges"][type]["first_uuid"] + last_uuid = Dataset.get_size(self)["uuid_ranges"][type]["last_uuid"] + + random_value = random.randint(first_uuid, last_uuid) + return random_value + + def __init__(self, variant=None): + Dataset.__init__(self, variant) + self.next_value_idx = Dataset.get_size(self)["vertices"] + 1 + + def benchmark__create__vertex(self): + self.next_value_idx += 1 + query = (f"CREATE (:User {{uuid: {self.next_value_idx}}});", {}) + return query + + def benchmark__create__edges(self): + permission_uuid = self._get_random_uuid("Permission") + user_uuid = self._get_random_uuid("User") + + query = ( + "MATCH (permission:Permission {uuid: $permission_uuid}), (user:User {uuid: $user_uuid}) " + "CREATE (permission)-[:IS_FOR_USER]->(user)", + {"permission_uuid": permission_uuid, "user_uuid": user_uuid}, + ) + + return query + + def benchmark__match__match_all_vertices(self): + self.next_value_idx += 1 + query = ("MATCH (n) RETURN *", {}) + return query + + def benchmark__match__match_on_labelled_vertices(self): + self.next_value_idx += 1 + query = ("MATCH (n:User) RETURN *", {}) + return query + + def benchmark__match__match_all_verteices_with_edges(self): + self.next_value_idx += 1 + query = ("MATCH 
(permission:Permission)-[e:IS_FOR_USER]->(user:User) RETURN *", {}) + return query
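+
+
+# A minimal usage sketch (illustrative only; the mgbench runner collects the
+# benchmark__<group>__<name> methods on its own and does not use this block).
+# Assuming the class can be instantiated without first downloading or preparing
+# the dataset files, running this module directly prints a few generated queries,
+# which is a quick way to sanity-check the uuid ranges defined above.
+if __name__ == "__main__":
+    dataset = Distributed("small")
+    for benchmark in (
+        dataset.benchmark__create__vertex,
+        dataset.benchmark__create__edges,
+        dataset.benchmark__match__match_all_vertices_with_edges,
+    ):
+        # Each benchmark method returns a (query, parameters) pair.
+        query, params = benchmark()
+        print(benchmark.__name__, query, params)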