memgraph/tests/e2e/analyze_graph/optimize_indexes.py

# Copyright 2023 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.

import sys

import pytest
from common import connect, execute_and_fetch_all

# E2E tests for checking query semantic
# ------------------------------------


@pytest.mark.parametrize(
    "delete_query",
    [
        "ANALYZE GRAPH DELETE STATISTICS",
        "ANALYZE GRAPH ON LABELS * DELETE STATISTICS",
        "ANALYZE GRAPH ON LABELS :Label DELETE STATISTICS",
        "ANALYZE GRAPH ON LABELS :Label, :NONEXISTING DELETE STATISTICS",
    ],
)
def test_analyze_graph_delete_statistics(delete_query, connect):
    """Tests that all variants of delete queries work as expected."""
    cursor = connect.cursor()
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 100) | CREATE (n:Label {id1: i}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 50) | CREATE (n:Label {id2: i % 5}));")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);")
    analyze_graph_results = execute_and_fetch_all(cursor, "ANALYZE GRAPH")
    assert len(analyze_graph_results) == 2
    delete_stats_results = execute_and_fetch_all(cursor, delete_query)
    assert len(delete_stats_results) == 2
    if delete_stats_results[0][1] == "id1":
        first_index = 0
    else:
        first_index = 1
    assert delete_stats_results[first_index] == ("Label", "id1")
    assert delete_stats_results[1 - first_index] == ("Label", "id2")
    # After deleting statistics, id2 should be chosen because it has less vertices
    expected_explain_after_delete_analysis = [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",),
        (f" * Once",),
    ]
    assert (
        execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
        == expected_explain_after_delete_analysis
    )
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);")


@pytest.mark.parametrize(
    "analyze_query",
    [
        "ANALYZE GRAPH",
        "ANALYZE GRAPH ON LABELS *",
        "ANALYZE GRAPH ON LABELS :Label",
        "ANALYZE GRAPH ON LABELS :Label, :NONEXISTING",
    ],
)
def test_analyze_full_graph(analyze_query, connect):
    """Tests analyzing full graph and choosing better index based on the smaller average group size.
    It also tests querying based on labels and that nothing bad will happen by providing non-existing label.
    """
    cursor = connect.cursor()
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 100) | CREATE (n:Label {id1: i}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 50) | CREATE (n:Label {id2: i % 5}));")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);")
    # Choose id2 before tha analysis because it has less vertices
    expected_explain_before_analysis = [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",),
        (f" * Once",),
    ]
    assert (
        execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
        == expected_explain_before_analysis
    )
    # Run analyze query
    analyze_graph_results = execute_and_fetch_all(cursor, analyze_query)
    assert len(analyze_graph_results) == 2
    if analyze_graph_results[0][1] == "id1":
        first_index = 0
    else:
        first_index = 1
    # Check results
    assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0)
    assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 5, 10, 0)
    # After analyzing graph, id1 index should be chosen because it has smaller average group size
    expected_explain_after_analysis = [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id1}})",),
        (f" * Once",),
    ]
    assert (
        execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
        == expected_explain_after_analysis
    )
    assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);")


# Explicit index choosing tests
# -----------------------------


def test_cardinality_different_avg_group_size_uniform_dist(connect):
    """Tests index optimization with indices both having uniform distribution but one has smaller avg. group size."""
    cursor = connect.cursor()
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 100) | CREATE (n:Label {id1: i}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 100) | CREATE (n:Label {id2: i % 20}));")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);")
    analyze_graph_results = execute_and_fetch_all(cursor, "ANALYZE GRAPH")
    if analyze_graph_results[0][1] == "id1":
        first_index = 0
    else:
        first_index = 1
    # Check results
    assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0)
    assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 20, 5, 0)
    expected_explain_after_analysis = [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id1}})",),
        (f" * Once",),
    ]
    assert (
        execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
        == expected_explain_after_analysis
    )
    assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);")


def test_cardinality_same_avg_group_size_uniform_dist_diff_vertex_count(connect):
    """Tests index choosing where both indices have uniform key distribution with same avg. group size but one has less vertices."""
    cursor = connect.cursor()
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 100) | CREATE (n:Label {id1: i}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 50) | CREATE (n:Label {id2: i}));")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);")
    analyze_graph_results = execute_and_fetch_all(cursor, "ANALYZE GRAPH")
    if analyze_graph_results[0][1] == "id1":
        first_index = 0
    else:
        first_index = 1
    # Check results
    assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0)
    assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 50, 1, 0)
    expected_explain_after_analysis = [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",),
        (f" * Once",),
    ]
    assert (
        execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
        == expected_explain_after_analysis
    )
    assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);")


def test_large_diff_in_num_vertices_v1(connect):
    """Tests that when one index has > 10x vertices than the other one, it should be chosen no matter avg group size and uniform distribution."""
    cursor = connect.cursor()
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 1000) | CREATE (n:Label {id1: i}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 99) | CREATE (n:Label {id2: 1}));")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);")
    analyze_graph_results = execute_and_fetch_all(cursor, "ANALYZE GRAPH")
    if analyze_graph_results[0][1] == "id1":
        first_index = 0
    else:
        first_index = 1
    # Check results
    assert analyze_graph_results[first_index] == ("Label", "id1", 1000, 1000, 1, 0)
    assert analyze_graph_results[1 - first_index] == ("Label", "id2", 99, 1, 99, 0)
    expected_explain_after_analysis = [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",),
        (f" * Once",),
    ]
    assert (
        execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
        == expected_explain_after_analysis
    )
    assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);")


def test_large_diff_in_num_vertices_v2(connect):
    """Tests that when one index has > 10x vertices than the other one, it should be chosen no matter avg group size and uniform distribution."""
    cursor = connect.cursor()
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 99) | CREATE (n:Label {id1: 1}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 1000) | CREATE (n:Label {id2: i}));")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);")
    analyze_graph_results = execute_and_fetch_all(cursor, "ANALYZE GRAPH")
    if analyze_graph_results[0][1] == "id1":
        first_index = 0
    else:
        first_index = 1
    # Check results
    assert analyze_graph_results[first_index] == ("Label", "id1", 99, 1, 99, 0)
    assert analyze_graph_results[1 - first_index] == ("Label", "id2", 1000, 1000, 1, 0)
    expected_explain_after_analysis = [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id1}})",),
        (f" * Once",),
    ]
    assert (
        execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
        == expected_explain_after_analysis
    )
    assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);")


def test_same_avg_group_size_diff_distribution(connect):
    """Tests index choice decision based on key distribution."""
    cursor = connect.cursor()
    # Setup first key distribution
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 10) | CREATE (n:Label {id1: 1}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 30) | CREATE (n:Label {id1: 2}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 20) | CREATE (n:Label {id1: 3}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 35) | CREATE (n:Label {id1: 4}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 5) | CREATE (n:Label  {id1: 5}));")
    # Setup second key distribution
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 1}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 2}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 3}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 4}));")
    execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 5}));")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);")
    analyze_graph_results = execute_and_fetch_all(cursor, "ANALYZE GRAPH")
    if analyze_graph_results[0][1] == "id1":
        first_index = 0
    else:
        first_index = 1
    # Check results
    assert analyze_graph_results[first_index] == ("Label", "id1", 100, 5, 20, 32.5)
    assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 5, 20, 0)
    expected_explain_after_analysis = [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",),
        (f" * Once",),
    ]
    assert (
        execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
        == expected_explain_after_analysis
    )
    assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);")
    execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);")


if __name__ == "__main__":
    sys.exit(pytest.main([__file__, "-rA"]))