memgraph/tests/mgbench/graph_bench.py
2023-09-22 19:05:16 +02:00

309 lines
9.3 KiB
Python

#!/usr/bin/env python3
# Copyright 2023 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.
import argparse
import json
import subprocess
from pathlib import Path
def parse_arguments():
    """Parse and return command-line arguments for the benchmark runner.

    Returns:
        argparse.Namespace with the parsed options. `--vendor`, `--realistic`
        and `--mixed` use action="append", so they are either None or a list
        of string lists.
    """
    parser = argparse.ArgumentParser(
        description="Run graph database benchmarks on supported databases(Memgraph and Neo4j)",
    )
    parser.add_argument(
        "--vendor",
        nargs=2,
        action="append",
        metavar=("vendor_name", "vendor_binary"),
        help="Forward name and paths to vendors binary"
        "Example: --vendor memgraph /path/to/binary --vendor neo4j /path/to/binary",
    )
    parser.add_argument(
        "--dataset-name",
        default="",
        help="Dataset name you wish to execute",
    )
    parser.add_argument(
        "--dataset-size",
        default="",
        help="Pick a dataset variant you wish to execute",
    )
    parser.add_argument("--dataset-group", default="", help="Select a group of queries")
    parser.add_argument(
        "--realistic",
        nargs=5,
        action="append",
        metavar=("num_of_queries", "write", "read", "update", "analytical"),
        help="Forward config for group run",
    )
    parser.add_argument(
        "--mixed",
        nargs=6,
        action="append",
        metavar=(
            "num_of_queries",
            "write",
            "read",
            "update",
            "analytical",
            "query_percentage",
        ),
        help="Forward config for query",
    )
    parser.add_argument(
        "--num-workers-for-benchmark",
        type=int,
        default=12,
        help="number of workers used to execute the benchmark",
    )
    parser.add_argument(
        "--query-count-lower-bound",
        type=int,
        default=300,
        # BUG FIX: help text was copy-pasted from --num-workers-for-benchmark
        # and wrongly described this option as a worker count.
        help="Lower bound on the number of queries executed per benchmark query (works only for isolated run)",
    )
    parser.add_argument(
        "--single-threaded-runtime-sec",
        type=int,
        default=30,
        help="Duration of single threaded benchmark per query (works only for isolated run)",
    )
    return parser.parse_args()
def _result_file_name(vendor, workers, dataset, dataset_size, suffix):
    """Build the exported-results file name shared by every configuration."""
    return f"{vendor}_{workers}_{dataset}_{dataset_size}_{suffix}"


def run_full_benchmarks(
    vendor,
    binary,
    dataset,
    dataset_size,
    dataset_group,
    realistic,
    mixed,
    workers,
    query_count_lower_bound,
    single_threaded_runtime_sec,
):
    """Invoke benchmark.py once per benchmark configuration.

    Always runs the three isolated configurations (cold, hot, vulcanic
    warm-up); additionally runs one cold and one hot configuration per
    entry of either `realistic` or `mixed`.

    Args:
        vendor: Vendor name forwarded to benchmark.py (e.g. "memgraph").
        binary: Path to the vendor binary.
        dataset, dataset_size, dataset_group: Select the query workload.
        realistic: Optional list of (count, write, read, update, analytical)
            tuples; mutually exclusive with `mixed`.
        mixed: Optional list of (count, write, read, update, analytical,
            query_percentage) tuples; mutually exclusive with `realistic`.
        workers: Number of benchmark workers.
        query_count_lower_bound: Minimum query count (isolated runs only).
        single_threaded_runtime_sec: Per-query single-threaded duration.

    Raises:
        AssertionError: If both `realistic` and `mixed` are supplied.
        subprocess.CalledProcessError: If any benchmark invocation fails
            (check=True).
    """
    # Isolated runs are always executed, once per warm-up condition.
    configurations = [
        # Basic isolated test cold
        [
            "--export-results",
            _result_file_name(vendor, workers, dataset, dataset_size, "cold_isolated.json"),
        ],
        # Basic isolated test hot
        [
            "--export-results",
            _result_file_name(vendor, workers, dataset, dataset_size, "hot_isolated.json"),
            "--warm-up",
            "hot",
        ],
        # Basic isolated test vulcanic
        [
            "--export-results",
            _result_file_name(vendor, workers, dataset, dataset_size, "vulcanic_isolated.json"),
            "--warm-up",
            "vulcanic",
        ],
    ]

    assert not realistic or not mixed, "Cannot run both realistic and mixed workload, please select one!"

    if realistic:
        # Configurations for the full (realistic) workload: one cold run
        # followed by one hot run per workload mix.
        for count, write, read, update, analytical in realistic:
            for condition in ("cold", "hot"):
                suffix = "{}_realistic_{}_{}_{}_{}_{}.json".format(
                    condition, count, write, read, update, analytical
                )
                config = [
                    "--export-results",
                    _result_file_name(vendor, workers, dataset, dataset_size, suffix),
                ]
                if condition == "hot":
                    config += ["--warm-up", "hot"]
                config += ["--workload-realistic", count, write, read, update, analytical]
                configurations.append(config)

    if mixed:
        # Configurations for the per-query (mixed) workload: one cold run
        # followed by one hot run per workload mix.
        for count, write, read, update, analytical, query in mixed:
            for condition in ("cold", "hot"):
                suffix = "{}_mixed_{}_{}_{}_{}_{}_{}.json".format(
                    condition, count, write, read, update, analytical, query
                )
                config = [
                    "--export-results",
                    _result_file_name(vendor, workers, dataset, dataset_size, suffix),
                ]
                if condition == "hot":
                    config += ["--warm-up", "hot"]
                config += ["--workload-mixed", count, write, read, update, analytical, query]
                configurations.append(config)

    # Arguments common to every benchmark.py invocation.
    default_args = [
        "python3",
        "benchmark.py",
        "vendor-native",
        "--vendor-binary",
        binary,
        "--vendor-name",
        vendor,
        "--num-workers-for-benchmark",
        str(workers),
        "--single-threaded-runtime-sec",
        str(single_threaded_runtime_sec),
        "--query-count-lower-bound",
        str(query_count_lower_bound),
        "--no-authorization",
        f"{dataset}/{dataset_size}/{dataset_group}/*",
    ]

    for config in configurations:
        full_config = default_args + config
        print(full_config)
        subprocess.run(args=full_config, check=True)
def collect_all_results(vendor_name, dataset, dataset_size, dataset_group, workers):
    """Merge per-configuration result files into a single *_summary.json.

    Globs the current working directory for the JSON files produced by
    run_full_benchmarks, suffixes each query key with its warm-up condition
    ("cold", "hot" or "vulcanic"), and writes the merged summary next to the
    inputs. Files with "summary" in their name are skipped so reruns do not
    fold a previous summary back in.

    Args:
        vendor_name: Vendor whose result files should be collected.
        dataset, dataset_size, dataset_group: Workload selectors; must match
            the values used when the result files were produced.
        workers: Worker count used in the result file names.
    """
    working_directory = Path().absolute()
    print(working_directory)
    results = sorted(
        working_directory.glob(f"{vendor_name}_{workers}_{dataset}_{dataset_size}_*.json")
    )
    summary = {dataset: {dataset_size: {dataset_group: {}}}}
    known_conditions = ("hot", "cold", "vulcanic")
    for file in results:
        if "summary" in file.name:
            continue
        # BUG FIX: the file handle was previously opened and never closed;
        # use a context manager so it is released deterministically.
        with file.open() as f:
            data = json.load(f)
        condition = data["__run_configuration__"]["condition"]
        # Unknown conditions are ignored, matching the original branch chain.
        if condition not in known_conditions:
            continue
        for key, value in data[dataset][dataset_size][dataset_group].items():
            summary[dataset][dataset_size][dataset_group][key + "_" + condition] = value
    print(summary)
    json_object = json.dumps(summary, indent=4)
    print(json_object)
    summary_name = f"{vendor_name}_{workers}_{dataset}_{dataset_size}_summary.json"
    with open(summary_name, "w") as f:
        json.dump(summary, f)
if __name__ == "__main__":
    args = parse_arguments()
    supported_vendors = {"memgraph", "neo4j"}
    for vendor_name, vendor_binary in args.vendor:
        binary_path = Path(vendor_binary)
        # Guard clause: reject unsupported vendors or missing binaries before
        # any benchmark work is started.
        if vendor_name.lower() not in supported_vendors or not binary_path.is_file():
            raise Exception(
                "Check that vendor: {} is supported and you are passing right path: {} to binary.".format(
                    vendor_name, binary_path
                )
            )
        run_full_benchmarks(
            vendor_name,
            vendor_binary,
            args.dataset_name,
            args.dataset_size,
            args.dataset_group,
            args.realistic,
            args.mixed,
            args.num_workers_for_benchmark,
            args.query_count_lower_bound,
            args.single_threaded_runtime_sec,
        )
        collect_all_results(
            vendor_name,
            args.dataset_name,
            args.dataset_size,
            args.dataset_group,
            args.num_workers_for_benchmark,
        )