memgraph/tests/mgbench/workloads/ldbc_bi.py

# Copyright 2023 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.

# ---  DISCLAIMER: This is NOT an official implementation of an LDBC Benchmark.  ---
import inspect
import random
from pathlib import Path

import helpers
from benchmark_context import BenchmarkContext
from workloads.base import Workload
from workloads.importers.importer_ldbc_bi import ImporterLDBCBI


class LDBC_BI(Workload):
    """mgbench workload built around the LDBC Business Intelligence (BI) dataset and queries.

    This is not an official LDBC benchmark implementation (see the disclaimer at the
    top of the file). The class-level tables below are keyed either by variant name
    (the scale factor: "sf1", "sf3", "sf10") or by vendor name ("memgraph", "neo4j").
    """

    NAME = "ldbc_bi"
    VARIANTS = ["sf1", "sf3", "sf10"]
    DEFAULT_VARIANT = "sf1"

    # Per-variant gzipped Cypher dataset files.
    URL_FILE = {
        "sf1": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/ldbc_bi_sf1.cypher.gz",
        "sf3": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/ldbc_bi_sf3.cypher.gz",
        "sf10": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/ldbc_bi_sf10.cypher.gz",
    }

    # Per-variant CSV archives (.tar.zst); handed to ImporterLDBCBI by custom_import().
    URL_CSV = {
        "sf1": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf1-composite-projected-fk.tar.zst",
        "sf3": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf3-composite-projected-fk.tar.zst",
        "sf10": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/bi-sf10-composite-projected-fk.tar.zst",
    }

    # Per-variant vertex and edge counts.
    # NOTE(review): the sf3 and sf10 entries are 1/1, which look like placeholders —
    # confirm the real counts before relying on them.
    SIZES = {
        "sf1": {"vertices": 2997352, "edges": 17196776},
        "sf3": {"vertices": 1, "edges": 1},
        "sf10": {"vertices": 1, "edges": 1},
    }

    LOCAL_INDEX_FILES = None

    # Per-vendor Cypher files; presumably the index definitions applied before the
    # benchmark runs (judging by the file names) — verify against the base Workload.
    URL_INDEX_FILE = {
        "memgraph": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/memgraph_bi_index.cypher",
        "neo4j": "https://s3.eu-west-1.amazonaws.com/deps.memgraph.io/dataset/ldbc/benchmark/bi/neo4j_bi_index.cypher",
    }

    # Per-variant query-parameter archives fetched by _prepare_parameters_directory().
    # All three variants currently point at the same zip file.
    QUERY_PARAMETERS = {
        "sf1": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/parameters-2022-10-01.zip",
        "sf3": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/parameters-2022-10-01.zip",
        "sf10": "https://pub-383410a98aef4cb686f0c7601eddd25f.r2.dev/bi-pre-audit/parameters-2022-10-01.zip",
    }

    def custom_import(self) -> bool:
        """Import the dataset for the selected variant through ``ImporterLDBCBI``.

        The importer receives the benchmark context, the workload name, the
        selected variant, the index file and the per-variant CSV archive table.

        Returns:
            Whatever ``ImporterLDBCBI.execute_import()`` returns.
        """
        return ImporterLDBCBI(
            benchmark_context=self.benchmark_context,
            dataset_name=self.NAME,
            variant=self._variant,
            index_file=self._file_index,
            csv_dict=self.URL_CSV,
        ).execute_import()

    def _prepare_parameters_directory(self):
        """Make sure the query parameter files are cached locally and return their directory.

        The cache directory is ``.cache/datasets/<NAME>/<variant>/parameters`` relative to
        the current working directory. When it contains no entries, the archive listed in
        ``QUERY_PARAMETERS`` for this variant is downloaded into the parent of that
        directory and unpacked with ``helpers.unpack_zip``.

        Returns:
            The path ``<cache directory>/parameters-<variant>``.
        """
        cache_dir = Path() / ".cache" / "datasets" / self.NAME / self._variant / "parameters"
        cache_dir.mkdir(parents=True, exist_ok=True)

        already_cached = cache_dir.exists() and any(cache_dir.iterdir())
        if not already_cached:
            print("Downloading files")
            archive = helpers.download_file(self.QUERY_PARAMETERS[self._variant], cache_dir.parent.absolute())
            print("Unpacking the file..." + archive)
            helpers.unpack_zip(Path(archive))
        else:
            print("Files downloaded.")

        return cache_dir / ("parameters-" + self._variant)

    def _get_query_parameters(self) -> dict:
        """Pick one random parameter binding for the benchmark query that called this method.

        The query number is taken from the *caller's* function name: the second-to-last
        ``_``-separated token, e.g. ``benchmark__bi__query_14_analytical`` -> ``"14"``.
        This method must therefore be called directly from the benchmark method, never
        through an intermediate helper.

        The binding is read from ``bi-<N>.csv`` in ``self._parameters_dir``, falling back
        to ``bi-<N>a.csv``. The first line of the file is a ``|``-separated header of
        ``name:TYPE`` columns; one of the remaining lines is chosen uniformly at random.

        Returns:
            A dict mapping parameter names to converted values, or an empty dict when
            no parameter file exists for the calling query.

        Raises:
            ValueError: if the parameter file has no data rows, or the chosen row has
                fewer fields than the header.
        """
        # Read the caller's name straight off the frame: inspect.stack() would build a
        # FrameInfo (including source context) for every frame on the stack, and this
        # runs once per generated query.
        frame = inspect.currentframe()
        try:
            if frame is not None and frame.f_back is not None:
                func_name = frame.f_back.f_code.co_name
            else:
                # Interpreters without frame support: fall back to the original lookup.
                func_name = inspect.stack()[1].function
        finally:
            # Drop the frame reference so no reference cycle is left behind.
            del frame

        query_id = func_name.split("_")[-2]

        # Exact match first, then the "a" variant (e.g. bi-14a.csv). Looking the file up
        # directly is deterministic and avoids scanning the directory on every call.
        for suffix in ("", "a"):
            candidate = self._parameters_dir / ("bi-" + query_id + suffix + ".csv")
            if candidate.is_file():
                return self._read_random_parameter_row(candidate)

        return {}

    @staticmethod
    def _read_random_parameter_row(path) -> dict:
        """Read one random data row of a ``|``-separated parameter file as a converted dict."""
        with path.open("r") as csv_file:
            # Blank lines (e.g. a trailing empty line) are not data rows.
            lines = [line.rstrip("\n") for line in csv_file if line.strip()]

        if len(lines) < 2:
            raise ValueError("Parameter file " + str(path) + " contains no data rows")

        header = lines[0].split("|")
        # randint keeps the same random stream usage as before: one draw per sampled row.
        position = random.randint(1, len(lines) - 1)
        data = lines[position].split("|")
        if len(data) < len(header):
            raise ValueError(
                "Row " + str(position) + " of " + str(path) + " has fewer fields than the header"
            )

        parameters = {}
        for column, raw in zip(header, data):
            key, value_type = column.split(":")
            parameters[key] = LDBC_BI._convert_parameter_value(value_type, raw)
        return parameters

    @staticmethod
    def _convert_parameter_value(value_type, raw):
        """Convert one raw CSV field according to the type declared in the header."""
        if value_type == "DATETIME":
            # Drop the time zone; assumes the value ends with a six-character
            # offset such as "+00:00".
            return raw[0:-6]
        if value_type == "DATE":
            # Extend a plain date to a date-time at midnight.
            return raw + "T00:00:00"
        if value_type == "INT":
            return int(raw)
        if value_type == "STRING[]":
            return raw.split(";")
        return raw

    def __init__(self, variant=None, benchmark_context: BenchmarkContext = None) -> None:
        """Initialise the workload and make sure the query parameter files are available.

        Args:
            variant: forwarded unchanged to the base ``Workload`` constructor.
            benchmark_context: forwarded unchanged to the base ``Workload`` constructor.
        """
        super().__init__(variant, benchmark_context=benchmark_context)
        # Must run after the base constructor: _prepare_parameters_directory() reads
        # self._variant (presumably set by Workload.__init__ — verify). It may download
        # and unpack the parameter archive when the local cache is empty.
        self._parameters_dir = self._prepare_parameters_directory()

    def benchmark__bi__query_1_analytical(self):
        """BI query 1: summarise messages created before ``$datetime``.

        Messages with content are grouped by creation year, comment/post flag and a
        length bucket; each group reports its message count, average and total length,
        and its share of all messages created before ``$datetime``.

        The two vendor variants differ only in the date-time constructor
        (``localDateTime`` for Memgraph, ``DateTime`` otherwise).

        Returns:
            A tuple ``(query, parameters)``: the query text with newlines removed and
            one randomly sampled parameter binding.
        """
        if self._vendor == "memgraph":
            query = """
            MATCH (message:Message)
            WHERE message.creationDate < localDateTime($datetime)
            WITH count(message) AS totalMessageCountInt
            WITH toFloat(totalMessageCountInt) AS totalMessageCount
            MATCH (message:Message)
            WHERE message.creationDate < localDateTime($datetime)
            AND message.content IS NOT NULL
            WITH
                totalMessageCount,
                message,
                message.creationDate.year AS year
            WITH
                totalMessageCount,
                year,
                message:Comment AS isComment,
                CASE
                    WHEN message.length <  40 THEN 0
                    WHEN message.length <  80 THEN 1
                    WHEN message.length < 160 THEN 2
                    ELSE                           3
                END AS lengthCategory,
                count(message) AS messageCount,
                sum(message.length) / toFloat(count(message)) AS averageMessageLength,
                sum(message.length) AS sumMessageLength
            RETURN
                year,
                isComment,
                lengthCategory,
                messageCount,
                averageMessageLength,
                sumMessageLength,
                messageCount / totalMessageCount AS percentageOfMessages
            ORDER BY
                year DESC,
                isComment ASC,
                lengthCategory ASC
            """
        else:
            query = """
            MATCH (message:Message)
            WHERE message.creationDate < DateTime($datetime)
            WITH count(message) AS totalMessageCountInt
            WITH toFloat(totalMessageCountInt) AS totalMessageCount
            MATCH (message:Message)
            WHERE message.creationDate < DateTime($datetime)
            AND message.content IS NOT NULL
            WITH
                totalMessageCount,
                message,
                message.creationDate.year AS year
            WITH
                totalMessageCount,
                year,
                message:Comment AS isComment,
                CASE
                    WHEN message.length <  40 THEN 0
                    WHEN message.length <  80 THEN 1
                    WHEN message.length < 160 THEN 2
                    ELSE                           3
                END AS lengthCategory,
                count(message) AS messageCount,
                sum(message.length) / toFloat(count(message)) AS averageMessageLength,
                sum(message.length) AS sumMessageLength
            RETURN
                year,
                isComment,
                lengthCategory,
                messageCount,
                averageMessageLength,
                sumMessageLength,
                messageCount / totalMessageCount AS percentageOfMessages
            ORDER BY
                year DESC,
                isComment ASC,
                lengthCategory ASC
            """
        # Sample the parameters once, for the variant that is actually returned.
        # The call stays in this method because the parameter file is selected from
        # the caller's function name.
        return query.replace("\n", ""), self._get_query_parameters()

    def benchmark__bi__query_2_analytical(self):
        """BI query 2: compare tag usage in two consecutive 100-day windows.

        For every tag of class ``$tagClass`` the query counts the messages carrying the
        tag in the 100 days starting at ``$date`` and in the following 100 days, and
        returns the 100 tags with the largest absolute difference.

        The vendor variants differ in the date-time constructor and in the duration
        map key (``day`` for Memgraph, ``days`` otherwise).

        Returns:
            A tuple ``(query, parameters)``: the query text with newlines removed and
            one randomly sampled parameter binding.
        """
        if self._vendor == "memgraph":
            query = """
            MATCH (tag:Tag)-[:HAS_TYPE]->(:TagClass {name: $tagClass})
            OPTIONAL MATCH (message1:Message)-[:HAS_TAG]->(tag)
            WHERE localDateTime($date) <= message1.creationDate
                AND message1.creationDate < localDateTime($date) + duration({day: 100})
            WITH tag, count(message1) AS countWindow1
            OPTIONAL MATCH (message2:Message)-[:HAS_TAG]->(tag)
            WHERE localDateTime($date) + duration({day: 100}) <= message2.creationDate
                AND message2.creationDate < localDateTime($date) + duration({day: 200})
            WITH
                tag,
                countWindow1,
                count(message2) AS countWindow2
            RETURN
                tag.name,
                countWindow1,
                countWindow2,
                abs(countWindow1 - countWindow2) AS diff
            ORDER BY
                diff DESC,
                tag.name ASC
            LIMIT 100
            """
        else:
            query = """
            MATCH (tag:Tag)-[:HAS_TYPE]->(:TagClass {name: $tagClass})
            OPTIONAL MATCH (message1:Message)-[:HAS_TAG]->(tag)
            WHERE DateTime($date) <= message1.creationDate
                AND message1.creationDate < DateTime($date) + duration({days: 100})
            WITH tag, count(message1) AS countWindow1
            OPTIONAL MATCH (message2:Message)-[:HAS_TAG]->(tag)
            WHERE DateTime($date) + duration({days: 100}) <= message2.creationDate
                AND message2.creationDate < DateTime($date) + duration({days: 200})
            WITH
                tag,
                countWindow1,
                count(message2) AS countWindow2
            RETURN
                tag.name,
                countWindow1,
                countWindow2,
                abs(countWindow1 - countWindow2) AS diff
            ORDER BY
                diff DESC,
                tag.name ASC
            LIMIT 100
            """
        # Sample the parameters once, for the variant that is actually returned.
        # The call stays in this method because the parameter file is selected from
        # the caller's function name.
        return query.replace("\n", ""), self._get_query_parameters()

    def benchmark__bi__query_3_analytical(self):
        """BI query 3: most active forums moderated from ``$country`` for a tag class.

        For forums whose moderator is located in ``$country``, counts the distinct
        messages in the forum's post threads that carry a tag of class ``$tagClass``
        and returns the top 20 forums. The same query text is used for every vendor.

        Returns:
            A tuple ``(query, parameters)``: the query text with newlines removed and
            one randomly sampled parameter binding.
        """
        cypher = """
            MATCH
                (:Country {name: $country})<-[:IS_PART_OF]-(:City)<-[:IS_LOCATED_IN]-
                (person:Person)<-[:HAS_MODERATOR]-(forum:Forum)-[:CONTAINER_OF]->
                (post:Post)<-[:REPLY_OF*0..]-(message:Message)-[:HAS_TAG]->(:Tag)-[:HAS_TYPE]->(:TagClass {name: $tagClass})
            RETURN
                forum.id as id,
                forum.title,
                person.id,
                count(DISTINCT message) AS messageCount
            ORDER BY
                messageCount DESC,
                id ASC
            LIMIT 20
            """.replace("\n", "")
        # Called from this method directly: the parameter file is chosen from the
        # caller's function name.
        bindings = self._get_query_parameters()
        return cypher, bindings

    def benchmark__bi__query_5_analytical(self):
        """BI query 5: score the creators of messages tagged ``$tag``.

        Per person, sums the number of tagged messages, their replies and their likes,
        and ranks by ``1*messageCount + 2*replyCount + 10*likeCount`` (top 100).
        The same query text is used for every vendor.

        Returns:
            A tuple ``(query, parameters)``: the query text with newlines removed and
            one randomly sampled parameter binding.
        """
        cypher = """
            MATCH (tag:Tag {name: $tag})<-[:HAS_TAG]-(message:Message)-[:HAS_CREATOR]->(person:Person)
            OPTIONAL MATCH (message)<-[likes:LIKES]-(:Person)
            WITH person, message, count(likes) AS likeCount
            OPTIONAL MATCH (message)<-[:REPLY_OF]-(reply:Comment)
            WITH person, message, likeCount, count(reply) AS replyCount
            WITH person, count(message) AS messageCount, sum(likeCount) AS likeCount, sum(replyCount) AS replyCount
            RETURN
                person.id,
                replyCount,
                likeCount,
                messageCount,
                1*messageCount + 2*replyCount + 10*likeCount AS score
            ORDER BY
                score DESC,
                person.id ASC
            LIMIT 100
            """.replace("\n", "")
        # Called from this method directly: the parameter file is chosen from the
        # caller's function name.
        bindings = self._get_query_parameters()
        return cypher, bindings

    def benchmark__bi__query_6_analytical(self):
        """BI query 6: authority score of people who wrote messages tagged ``$tag``.

        For each author (``person1``) of a tagged message, the score is the number of
        distinct likes received by any message of the people who liked ``person1``'s
        tagged messages; the top 100 authors are returned. The same query text is used
        for every vendor.

        Returns:
            A tuple ``(query, parameters)``: the query text with newlines removed and
            one randomly sampled parameter binding.
        """
        cypher = """
            MATCH (tag:Tag {name: $tag})<-[:HAS_TAG]-(message1:Message)-[:HAS_CREATOR]->(person1:Person)
            OPTIONAL MATCH (message1)<-[:LIKES]-(person2:Person)
            OPTIONAL MATCH (person2)<-[:HAS_CREATOR]-(message2:Message)<-[like:LIKES]-(person3:Person)
            RETURN
                person1.id as id,
                count(DISTINCT like) AS authorityScore
            ORDER BY
                authorityScore DESC,
                id ASC
            LIMIT 100
            """.replace("\n", "")
        # Called from this method directly: the parameter file is chosen from the
        # caller's function name.
        bindings = self._get_query_parameters()
        return cypher, bindings

    def benchmark__bi__query_7_analytical(self):
        """BI query 7: tags related to ``$tag``.

        Finds comments replying to a message tagged ``$tag`` that do *not* carry
        ``$tag`` themselves, and counts those comments per tag they do carry
        (limited to 100 rows).

        The Neo4j variant expresses the exclusion with a pattern predicate
        (``WHERE NOT (comment)-[:HAS_TAG]->(tag)``). The Memgraph variant expresses the
        same exclusion as an anti-join: the ``HAS_TAG`` edge to ``tag`` is matched
        optionally and only rows where that edge is null are kept. The previous
        Memgraph text used ``OPTIONAL MATCH ... WHERE tag IS NOT NULL``; ``tag`` is
        already bound at that point, so it filtered nothing and the two vendors ran
        different queries.

        The Memgraph variant returns the ``relatedTag`` node rather than its name, as
        the original did.
        NOTE(review): presumably so that ``ORDER BY relatedTag.name`` stays valid after
        the aggregating ``RETURN`` — confirm before changing it.

        Returns:
            A tuple ``(query, parameters)``: the query text with newlines removed and
            one randomly sampled parameter binding.
        """
        if self._vendor == "memgraph":
            query = """
            MATCH
                (tag:Tag {name: $tag})<-[:HAS_TAG]-(message:Message),
                (message)<-[:REPLY_OF]-(comment:Comment)-[:HAS_TAG]->(relatedTag:Tag)
            OPTIONAL MATCH (comment)-[ownTag:HAS_TAG]->(tag)
            WITH relatedTag, comment, ownTag
            WHERE ownTag IS NULL
            RETURN
                relatedTag,
                count(DISTINCT comment) AS count
            ORDER BY
                relatedTag.name ASC,
                count DESC
            LIMIT 100
            """
        else:
            query = """
            MATCH
                (tag:Tag {name: $tag})<-[:HAS_TAG]-(message:Message),
                (message)<-[:REPLY_OF]-(comment:Comment)-[:HAS_TAG]->(relatedTag:Tag)
            WHERE NOT (comment)-[:HAS_TAG]->(tag)
            RETURN
                relatedTag.name,
                count(DISTINCT comment) AS count
            ORDER BY
                relatedTag.name ASC,
                count DESC
            LIMIT 100
            """
        # Sample the parameters once, for the variant that is actually returned.
        # The call stays in this method because the parameter file is selected from
        # the caller's function name.
        return query.replace("\n", ""), self._get_query_parameters()

    def benchmark__bi__query_9_analytical(self):
        """BI query 9: top thread initiators between ``$startDate`` and ``$endDate``.

        For each person, counts the distinct posts they created in the interval and
        the distinct messages in those posts' reply trees that were also created in
        the interval; returns the top 100 by message count.

        The two vendor variants differ only in the date-time constructor
        (``localDateTime`` for Memgraph, ``DateTime`` otherwise).

        Returns:
            A tuple ``(query, parameters)``: the query text with newlines removed and
            one randomly sampled parameter binding.
        """
        if self._vendor == "memgraph":
            query = """
            MATCH (person:Person)<-[:HAS_CREATOR]-(post:Post)<-[:REPLY_OF*0..]-(reply:Message)
            WHERE  post.creationDate >= localDateTime($startDate)
                AND  post.creationDate <= localDateTime($endDate)
                AND reply.creationDate >= localDateTime($startDate)
                AND reply.creationDate <= localDateTime($endDate)
            RETURN
                person.id as id,
                person.firstName,
                person.lastName,
                count(DISTINCT post) AS threadCount,
                count(DISTINCT reply) AS messageCount
            ORDER BY
                messageCount DESC,
                id ASC
            LIMIT 100
            """
        else:
            query = """
            MATCH (person:Person)<-[:HAS_CREATOR]-(post:Post)<-[:REPLY_OF*0..]-(reply:Message)
            WHERE  post.creationDate >= DateTime($startDate)
                AND  post.creationDate <= DateTime($endDate)
                AND reply.creationDate >= DateTime($startDate)
                AND reply.creationDate <= DateTime($endDate)
            RETURN
                person.id as id,
                person.firstName,
                person.lastName,
                count(DISTINCT post) AS threadCount,
                count(DISTINCT reply) AS messageCount
            ORDER BY
                messageCount DESC,
                id ASC
            LIMIT 100
            """
        # Sample the parameters once, for the variant that is actually returned.
        # The call stays in this method because the parameter file is selected from
        # the caller's function name.
        return query.replace("\n", ""), self._get_query_parameters()

    def benchmark__bi__query_11_analytical(self):
        """BI query 11: count friend triangles inside ``$country``.

        Counts distinct triples of persons located in ``$country`` who pairwise know
        each other, where every one of the three friendships was created between
        ``$startDate`` and ``$endDate``.

        Like the other date-filtered queries in this class, the Memgraph variant
        builds its bounds with ``localDateTime`` and the variant for other vendors
        with ``DateTime``. Previously the ``localDateTime`` text was sent to every
        vendor.
        NOTE(review): this assumes the Neo4j import stores ``creationDate`` as a zoned
        DateTime, as the Neo4j variants of queries 1, 2, 9 and 13 imply — confirm.

        Returns:
            A tuple ``(query, parameters)``: the query text with newlines removed and
            one randomly sampled parameter binding.
        """
        if self._vendor == "memgraph":
            query = """
            MATCH (a:Person)-[:IS_LOCATED_IN]->(:City)-[:IS_PART_OF]->(country:Country {name: $country}),
                (a)-[k1:KNOWS]-(b:Person)
            WHERE a.id < b.id
                AND localDateTime($startDate) <= k1.creationDate AND k1.creationDate <= localDateTime($endDate)
            WITH DISTINCT country, a, b
            MATCH (b)-[:IS_LOCATED_IN]->(:City)-[:IS_PART_OF]->(country)
            WITH DISTINCT country, a, b
            MATCH (b)-[k2:KNOWS]-(c:Person),
                (c)-[:IS_LOCATED_IN]->(:City)-[:IS_PART_OF]->(country)
            WHERE b.id < c.id
                AND localDateTime($startDate) <= k2.creationDate AND k2.creationDate <= localDateTime($endDate)
            WITH DISTINCT a, b, c
            MATCH (c)-[k3:KNOWS]-(a)
            WHERE localDateTime($startDate) <= k3.creationDate AND k3.creationDate <= localDateTime($endDate)
            WITH DISTINCT a, b, c
            RETURN count(*) AS count
            """
        else:
            query = """
            MATCH (a:Person)-[:IS_LOCATED_IN]->(:City)-[:IS_PART_OF]->(country:Country {name: $country}),
                (a)-[k1:KNOWS]-(b:Person)
            WHERE a.id < b.id
                AND DateTime($startDate) <= k1.creationDate AND k1.creationDate <= DateTime($endDate)
            WITH DISTINCT country, a, b
            MATCH (b)-[:IS_LOCATED_IN]->(:City)-[:IS_PART_OF]->(country)
            WITH DISTINCT country, a, b
            MATCH (b)-[k2:KNOWS]-(c:Person),
                (c)-[:IS_LOCATED_IN]->(:City)-[:IS_PART_OF]->(country)
            WHERE b.id < c.id
                AND DateTime($startDate) <= k2.creationDate AND k2.creationDate <= DateTime($endDate)
            WITH DISTINCT a, b, c
            MATCH (c)-[k3:KNOWS]-(a)
            WHERE DateTime($startDate) <= k3.creationDate AND k3.creationDate <= DateTime($endDate)
            WITH DISTINCT a, b, c
            RETURN count(*) AS count
            """
        # Called from this method directly: the parameter file is chosen from the
        # caller's function name.
        return query.replace("\n", ""), self._get_query_parameters()

    def benchmark__bi__query_12_analytical(self):
        """BI query 12: how many persons have a given message count.

        For every person, counts their messages that have content, are shorter than
        ``$lengthThreshold``, were created after ``$startDate`` and belong to a thread
        whose root post is written in one of ``$languages``; then reports how many
        persons share each message count.

        Like the other date-filtered queries in this class, the Memgraph variant
        builds its bound with ``localDateTime`` and the variant for other vendors
        with ``DateTime``. Previously the ``localDateTime`` text was sent to every
        vendor.
        NOTE(review): this assumes the Neo4j import stores ``creationDate`` as a zoned
        DateTime, as the Neo4j variants of queries 1, 2, 9 and 13 imply — confirm.

        Returns:
            A tuple ``(query, parameters)``: the query text with newlines removed and
            one randomly sampled parameter binding.
        """
        if self._vendor == "memgraph":
            query = """
            MATCH (person:Person)
            OPTIONAL MATCH (person)<-[:HAS_CREATOR]-(message:Message)-[:REPLY_OF*0..]->(post:Post)
            WHERE message.content IS NOT NULL
                AND message.length < $lengthThreshold
                AND message.creationDate > localDateTime($startDate)
                AND post.language IN $languages
            WITH
                person,
                count(message) AS messageCount
            RETURN
                messageCount,
                count(person) AS personCount
            ORDER BY
                personCount DESC,
                messageCount DESC
            """
        else:
            query = """
            MATCH (person:Person)
            OPTIONAL MATCH (person)<-[:HAS_CREATOR]-(message:Message)-[:REPLY_OF*0..]->(post:Post)
            WHERE message.content IS NOT NULL
                AND message.length < $lengthThreshold
                AND message.creationDate > DateTime($startDate)
                AND post.language IN $languages
            WITH
                person,
                count(message) AS messageCount
            RETURN
                messageCount,
                count(person) AS personCount
            ORDER BY
                personCount DESC,
                messageCount DESC
            """
        # Called from this method directly: the parameter file is chosen from the
        # caller's function name.
        return query.replace("\n", ""), self._get_query_parameters()

    def benchmark__bi__query_13_analytical(self):
        """BI query 13: "zombie" accounts in ``$country``.

        A zombie is a person located in ``$country``, created before ``$endDate``,
        whose message count before ``$endDate`` divided by the number of months since
        their creation is below 1. For each zombie the query counts the likes received
        from other zombies and from any person created before ``$endDate``, and
        returns the 100 highest ratios of the two.

        The two vendor variants differ only in the date-time constructor
        (``localDateTime`` for Memgraph, ``DateTime`` otherwise).

        Returns:
            A tuple ``(query, parameters)``: the query text with newlines removed and
            one randomly sampled parameter binding.
        """
        if self._vendor == "memgraph":
            query = """
            MATCH (country:Country {name: $country})<-[:IS_PART_OF]-(:City)<-[:IS_LOCATED_IN]-(zombie:Person)
            WHERE zombie.creationDate < localDateTime($endDate)
            WITH country, zombie
            OPTIONAL MATCH (zombie)<-[:HAS_CREATOR]-(message:Message)
            WHERE message.creationDate < localDateTime($endDate)
            WITH
                country,
                zombie,
                count(message) AS messageCount
            WITH
                country,
                zombie,
                12 * (localDateTime($endDate).year  - zombie.creationDate.year )
                    + (localDateTime($endDate).month - zombie.creationDate.month)
                    + 1 AS months,
                messageCount
            WHERE messageCount / months < 1
            WITH
                country,
                collect(zombie) AS zombies
            UNWIND zombies AS zombie
            OPTIONAL MATCH
                (zombie)<-[:HAS_CREATOR]-(message:Message)<-[:LIKES]-(likerZombie:Person)
            WHERE likerZombie IN zombies
            WITH
                zombie,
                count(likerZombie) AS zombieLikeCount
            OPTIONAL MATCH
                (zombie)<-[:HAS_CREATOR]-(message:Message)<-[:LIKES]-(likerPerson:Person)
            WHERE likerPerson.creationDate < localDateTime($endDate)
            WITH
                zombie,
                zombieLikeCount,
                count(likerPerson) AS totalLikeCount
            RETURN
                zombie.id,
                zombieLikeCount,
                totalLikeCount,
            CASE totalLikeCount
            WHEN 0 THEN 0.0
            ELSE zombieLikeCount / toFloat(totalLikeCount)
            END AS zombieScore
            ORDER BY
                zombieScore DESC,
                zombie.id ASC
            LIMIT 100
            """
        else:
            query = """
            MATCH (country:Country {name: $country})<-[:IS_PART_OF]-(:City)<-[:IS_LOCATED_IN]-(zombie:Person)
            WHERE zombie.creationDate < DateTime($endDate)
            WITH country, zombie
            OPTIONAL MATCH (zombie)<-[:HAS_CREATOR]-(message:Message)
            WHERE message.creationDate < DateTime($endDate)
            WITH
                country,
                zombie,
                count(message) AS messageCount
            WITH
                country,
                zombie,
                12 * (DateTime($endDate).year  - zombie.creationDate.year )
                    + (DateTime($endDate).month - zombie.creationDate.month)
                    + 1 AS months,
                messageCount
            WHERE messageCount / months < 1
            WITH
                country,
                collect(zombie) AS zombies
            UNWIND zombies AS zombie
            OPTIONAL MATCH
                (zombie)<-[:HAS_CREATOR]-(message:Message)<-[:LIKES]-(likerZombie:Person)
            WHERE likerZombie IN zombies
            WITH
                zombie,
                count(likerZombie) AS zombieLikeCount
            OPTIONAL MATCH
                (zombie)<-[:HAS_CREATOR]-(message:Message)<-[:LIKES]-(likerPerson:Person)
            WHERE likerPerson.creationDate < DateTime($endDate)
            WITH
                zombie,
                zombieLikeCount,
                count(likerPerson) AS totalLikeCount
            RETURN
                zombie.id,
                zombieLikeCount,
                totalLikeCount,
            CASE totalLikeCount
            WHEN 0 THEN 0.0
            ELSE zombieLikeCount / toFloat(totalLikeCount)
            END AS zombieScore
            ORDER BY
                zombieScore DESC,
                zombie.id ASC
            LIMIT 100
            """
        # Sample the parameters once, for the variant that is actually returned.
        # The call stays in this method because the parameter file is selected from
        # the caller's function name.
        return query.replace("\n", ""), self._get_query_parameters()

    def benchmark__bi__query_14_analytical(self):
        """BI query 14: best-interacting friend pair per city across two countries.

        Takes pairs of friends with ``person1`` located in ``$country1`` and
        ``person2`` in ``$country2``, accumulates an interaction score over four
        optional patterns (replies and likes in both directions), keeps the first pair
        per ``person1`` city after sorting, and returns the top 100 rows. The same
        query text is used for every vendor.

        Returns:
            A tuple ``(query, parameters)``: the query text with newlines removed and
            one randomly sampled parameter binding.
        """
        cypher = """
            MATCH
                (country1:Country {name: $country1})<-[:IS_PART_OF]-(city1:City)<-[:IS_LOCATED_IN]-(person1:Person),
                (country2:Country {name: $country2})<-[:IS_PART_OF]-(city2:City)<-[:IS_LOCATED_IN]-(person2:Person),
                (person1)-[:KNOWS]-(person2)
            WITH person1, person2, city1, 0 AS score
            OPTIONAL MATCH (person1)<-[:HAS_CREATOR]-(c:Comment)-[:REPLY_OF]->(:Message)-[:HAS_CREATOR]->(person2)
            WITH DISTINCT person1, person2, city1, score + (CASE c WHEN null THEN 0 ELSE  4 END) AS score
            OPTIONAL MATCH (person1)<-[:HAS_CREATOR]-(m:Message)<-[:REPLY_OF]-(:Comment)-[:HAS_CREATOR]->(person2)
            WITH DISTINCT person1, person2, city1, score + (CASE m WHEN null THEN 0 ELSE  1 END) AS score
            OPTIONAL MATCH (person1)-[:LIKES]->(m:Message)-[:HAS_CREATOR]->(person2)
            WITH DISTINCT person1, person2, city1, score + (CASE m WHEN null THEN 0 ELSE 10 END) AS score
            OPTIONAL MATCH (person1)<-[:HAS_CREATOR]-(m:Message)<-[:LIKES]-(person2)
            WITH DISTINCT person1, person2, city1, score + (CASE m WHEN null THEN 0 ELSE  1 END) AS score
            ORDER BY
                city1.name ASC,
                score DESC,
                person1.id ASC,
                person2.id ASC
            WITH city1, collect({score: score, person1Id: person1.id, person2Id: person2.id})[0] AS top
            RETURN
                top.person1Id,
                top.person2Id,
                city1.name,
                top.score
            ORDER BY
                top.score DESC,
                top.person1Id ASC,
                top.person2Id ASC
            LIMIT 100
            """.replace("\n", "")
        # Called from this method directly: the parameter file is chosen from the
        # caller's function name.
        bindings = self._get_query_parameters()
        return cypher, bindings

    def benchmark__bi__query_18_analytical(self):
        """BI query 18: friend recommendation by mutual friends.

        Finds pairs of distinct persons who are both interested in ``$tag``, share at
        least one mutual friend and do *not* know each other directly, and returns the
        20 pairs with the most distinct mutual friends.

        The Neo4j variant expresses "do not know each other" with a pattern predicate.
        The Memgraph variant expresses it as an anti-join: the direct ``KNOWS`` edge is
        matched optionally and only rows where that edge is null are kept. The previous
        Memgraph text attached ``WHERE person1 <> person2`` to an ``OPTIONAL MATCH``
        that introduced no new variables, so neither condition removed any row and the
        two vendors ran different queries.

        Returns:
            A tuple ``(query, parameters)``: the query text with newlines removed and
            one randomly sampled parameter binding.
        """
        if self._vendor == "memgraph":
            query = """
            MATCH (tag:Tag {name: $tag})<-[:HAS_INTEREST]-(person1:Person)-[:KNOWS]-(mutualFriend:Person)-[:KNOWS]-(person2:Person)-[:HAS_INTEREST]->(tag)
            WHERE person1 <> person2
            OPTIONAL MATCH (person1)-[direct:KNOWS]-(person2)
            WITH person1, person2, mutualFriend, direct
            WHERE direct IS NULL
            RETURN person1.id AS person1Id, person2.id AS person2Id, count(DISTINCT mutualFriend) AS mutualFriendCount
            ORDER BY mutualFriendCount DESC, person1Id ASC, person2Id ASC
            LIMIT 20
            """
        else:
            query = """
            MATCH (tag:Tag {name: $tag})<-[:HAS_INTEREST]-(person1:Person)-[:KNOWS]-(mutualFriend:Person)-[:KNOWS]-(person2:Person)-[:HAS_INTEREST]->(tag)
            WHERE person1 <> person2
                AND NOT (person1)-[:KNOWS]-(person2)
            RETURN person1.id AS person1Id, person2.id AS person2Id, count(DISTINCT mutualFriend) AS mutualFriendCount
            ORDER BY mutualFriendCount DESC, person1Id ASC, person2Id ASC
            LIMIT 20
            """
        # Sample the parameters once, for the variant that is actually returned.
        # The call stays in this method because the parameter file is selected from
        # the caller's function name.
        return query.replace("\n", ""), self._get_query_parameters()