diff --git a/src/auth/models.cpp b/src/auth/models.cpp index 18574d369..24f8ab2d7 100644 --- a/src/auth/models.cpp +++ b/src/auth/models.cpp @@ -16,6 +16,7 @@ #include "auth/crypto.hpp" #include "auth/exceptions.hpp" #include "license/license.hpp" +#include "query/constants.hpp" #include "utils/cast.hpp" #include "utils/logging.hpp" #include "utils/settings.hpp" @@ -270,7 +271,7 @@ PermissionLevel FineGrainedAccessPermissions::Has(const std::string &permission, void FineGrainedAccessPermissions::Grant(const std::string &permission, const FineGrainedPermission fine_grained_permission) { - if (permission == kAsterisk) { + if (permission == query::kAsterisk) { global_permission_ = CalculateGrant(fine_grained_permission); } else { permissions_[permission] = CalculateGrant(fine_grained_permission); @@ -278,7 +279,7 @@ void FineGrainedAccessPermissions::Grant(const std::string &permission, } void FineGrainedAccessPermissions::Revoke(const std::string &permission) { - if (permission == kAsterisk) { + if (permission == query::kAsterisk) { permissions_.clear(); global_permission_ = std::nullopt; } else { diff --git a/src/auth/models.hpp b/src/auth/models.hpp index b902b6960..85a1ae31d 100644 --- a/src/auth/models.hpp +++ b/src/auth/models.hpp @@ -15,7 +15,6 @@ #include namespace memgraph::auth { -const std::string kAsterisk = "*"; // These permissions must have values that are applicable for usage in a // bitmask. 
// clang-format off diff --git a/src/glue/auth_checker.cpp b/src/glue/auth_checker.cpp index debdc0f5b..011a4bb3b 100644 --- a/src/glue/auth_checker.cpp +++ b/src/glue/auth_checker.cpp @@ -15,6 +15,7 @@ #include "auth/models.hpp" #include "glue/auth.hpp" #include "license/license.hpp" +#include "query/constants.hpp" #include "query/frontend/ast/ast.hpp" #include "utils/synchronized.hpp" @@ -38,7 +39,7 @@ bool IsUserAuthorizedGloballyLabels(const memgraph::auth::User &user, if (!memgraph::license::global_license_checker.IsEnterpriseValidFast()) { return true; } - return user.GetFineGrainedAccessLabelPermissions().Has(memgraph::auth::kAsterisk, fine_grained_permission) == + return user.GetFineGrainedAccessLabelPermissions().Has(memgraph::query::kAsterisk, fine_grained_permission) == memgraph::auth::PermissionLevel::GRANT; } @@ -47,7 +48,7 @@ bool IsUserAuthorizedGloballyEdges(const memgraph::auth::User &user, if (!memgraph::license::global_license_checker.IsEnterpriseValidFast()) { return true; } - return user.GetFineGrainedAccessEdgeTypePermissions().Has(memgraph::auth::kAsterisk, fine_grained_permission) == + return user.GetFineGrainedAccessEdgeTypePermissions().Has(memgraph::query::kAsterisk, fine_grained_permission) == memgraph::auth::PermissionLevel::GRANT; } diff --git a/src/glue/auth_handler.cpp b/src/glue/auth_handler.cpp index b08d47dfe..12cc533b7 100644 --- a/src/glue/auth_handler.cpp +++ b/src/glue/auth_handler.cpp @@ -1,4 +1,4 @@ -// Copyright 2022 Memgraph Ltd. +// Copyright 2023 Memgraph Ltd. 
// // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -18,6 +18,7 @@ #include "auth/models.hpp" #include "glue/auth.hpp" #include "license/license.hpp" +#include "query/constants.hpp" namespace { @@ -253,19 +254,18 @@ bool AuthQueryHandler::CreateUser(const std::string &username, const std::option if (first_user) { spdlog::info("{} is first created user. Granting all privileges.", username); - GrantPrivilege(username, memgraph::query::kPrivilegesAll + GrantPrivilege( + username, memgraph::query::kPrivilegesAll #ifdef MG_ENTERPRISE - , - {{{memgraph::query::AuthQuery::FineGrainedPrivilege::CREATE_DELETE, {memgraph::auth::kAsterisk}}}}, - { - { - { - memgraph::query::AuthQuery::FineGrainedPrivilege::CREATE_DELETE, { - memgraph::auth::kAsterisk - } - } - } - } + , + {{{memgraph::query::AuthQuery::FineGrainedPrivilege::CREATE_DELETE, {memgraph::query::kAsterisk}}}}, + { + { + { + memgraph::query::AuthQuery::FineGrainedPrivilege::CREATE_DELETE, { memgraph::query::kAsterisk } + } + } + } #endif ); } diff --git a/src/query/constants.hpp b/src/query/constants.hpp index 805739157..5a563524d 100644 --- a/src/query/constants.hpp +++ b/src/query/constants.hpp @@ -1,4 +1,4 @@ -// Copyright 2022 Memgraph Ltd. +// Copyright 2023 Memgraph Ltd. 
// // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -16,4 +16,6 @@ namespace memgraph::query { inline constexpr uint16_t kDefaultReplicationPort = 10000; inline constexpr auto *kDefaultReplicationServerIp = "0.0.0.0"; +inline const std::string kAsterisk = "*"; +inline constexpr uint16_t kDeleteStatisticsNumResults = 6; } // namespace memgraph::query diff --git a/src/query/db_accessor.hpp b/src/query/db_accessor.hpp index 91c2ec721..564f4b2c4 100644 --- a/src/query/db_accessor.hpp +++ b/src/query/db_accessor.hpp @@ -430,6 +430,25 @@ class DbAccessor final { return accessor_->LabelPropertyIndexExists(label, prop); } + std::optional GetIndexStats(const storage::LabelId &label, + const storage::PropertyId &property) const { + return accessor_->GetIndexStats(label, property); + } + + std::vector> ClearIndexStats() { + return accessor_->ClearIndexStats(); + } + + std::vector> DeleteIndexStatsForLabels( + const std::span labels) { + return accessor_->DeleteIndexStatsForLabels(labels); + } + + void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property, + const storage::IndexStats &stats) { + accessor_->SetIndexStats(label, property, stats); + } + int64_t VerticesCount() const { return accessor_->ApproximateVertexCount(); } int64_t VerticesCount(storage::LabelId label) const { return accessor_->ApproximateVertexCount(label); } diff --git a/src/query/exceptions.hpp b/src/query/exceptions.hpp index e48b3e525..659f8e9e5 100644 --- a/src/query/exceptions.hpp +++ b/src/query/exceptions.hpp @@ -229,6 +229,12 @@ class VersionInfoInMulticommandTxException : public QueryException { : QueryException("Version info query not allowed in multicommand transactions.") {} }; +class AnalyzeGraphInMulticommandTxException : public QueryException { + public: + AnalyzeGraphInMulticommandTxException() + : QueryException("Analyze 
graph query not allowed in multicommand transactions.") {} +}; + class ReplicationException : public utils::BasicException { public: using utils::BasicException::BasicException; diff --git a/src/query/frontend/ast/ast.cpp b/src/query/frontend/ast/ast.cpp index 6f7b0bddd..aaea154b9 100644 --- a/src/query/frontend/ast/ast.cpp +++ b/src/query/frontend/ast/ast.cpp @@ -260,6 +260,9 @@ constexpr utils::TypeInfo query::Foreach::kType{utils::TypeId::AST_FOREACH, "For constexpr utils::TypeInfo query::ShowConfigQuery::kType{utils::TypeId::AST_SHOW_CONFIG_QUERY, "ShowConfigQuery", &query::Query::kType}; +constexpr utils::TypeInfo query::AnalyzeGraphQuery::kType{utils::TypeId::AST_ANALYZE_GRAPH_QUERY, "AnalyzeGraphQuery", + &query::Query::kType}; + constexpr utils::TypeInfo query::TransactionQueueQuery::kType{utils::TypeId::AST_TRANSACTION_QUEUE_QUERY, "TransactionQueueQuery", &query::Query::kType}; diff --git a/src/query/frontend/ast/ast.hpp b/src/query/frontend/ast/ast.hpp index a2b7bd9f4..1332d117b 100644 --- a/src/query/frontend/ast/ast.hpp +++ b/src/query/frontend/ast/ast.hpp @@ -3230,6 +3230,26 @@ class TransactionQueueQuery : public memgraph::query::Query { } }; +class AnalyzeGraphQuery : public memgraph::query::Query { + public: + static const utils::TypeInfo kType; + const utils::TypeInfo &GetTypeInfo() const override { return kType; } + + DEFVISITABLE(QueryVisitor); + + enum class Action { ANALYZE, DELETE }; + + memgraph::query::AnalyzeGraphQuery::Action action_; + std::vector labels_; + + AnalyzeGraphQuery *Clone(AstStorage *storage) const override { + auto *object = storage->Create(); + object->action_ = action_; + object->labels_ = labels_; + return object; + } +}; + class Exists : public memgraph::query::Expression { public: static const utils::TypeInfo kType; diff --git a/src/query/frontend/ast/ast.lcp b/src/query/frontend/ast/ast.lcp index 9bc82e31c..c6ed71bbc 100644 --- a/src/query/frontend/ast/ast.lcp +++ b/src/query/frontend/ast/ast.lcp @@ -2403,6 +2403,26 
@@ cpp<# (:serialize (:slk)) (:clone)) +(lcp:define-class analyze-graph-query (query) + ((action "Action" :scope :public) + (labels "std::vector" :scope :public)) + + (:public + (lcp:define-enum action + (analyze delete) + (:serialize)) + #>cpp + AnalyzeGraphQuery() = default; + + DEFVISITABLE(QueryVisitor); + cpp<#) + (:private + #>cpp + friend class AstStorage; + cpp<#) + (:serialize (:slk)) + (:clone)) + (lcp:define-class replication-query (query) ((action "Action" :scope :public) (role "ReplicationRole" :scope :public) diff --git a/src/query/frontend/ast/ast_visitor.hpp b/src/query/frontend/ast/ast_visitor.hpp index 81ef7f5c6..2b28008bc 100644 --- a/src/query/frontend/ast/ast_visitor.hpp +++ b/src/query/frontend/ast/ast_visitor.hpp @@ -95,6 +95,7 @@ class SettingQuery; class VersionQuery; class Foreach; class ShowConfigQuery; +class AnalyzeGraphQuery; class TransactionQueueQuery; class Exists; @@ -131,7 +132,7 @@ template class QueryVisitor : public utils::Visitor {}; + IsolationLevelQuery, CreateSnapshotQuery, StreamQuery, SettingQuery, VersionQuery, + ShowConfigQuery, TransactionQueueQuery, AnalyzeGraphQuery> {}; } // namespace memgraph::query diff --git a/src/query/frontend/ast/cypher_main_visitor.cpp b/src/query/frontend/ast/cypher_main_visitor.cpp index 8425f8fe2..ad92cd1cb 100644 --- a/src/query/frontend/ast/cypher_main_visitor.cpp +++ b/src/query/frontend/ast/cypher_main_visitor.cpp @@ -241,6 +241,23 @@ antlrcpp::Any CypherMainVisitor::visitDumpQuery(MemgraphCypher::DumpQueryContext return dump_query; } +antlrcpp::Any CypherMainVisitor::visitAnalyzeGraphQuery(MemgraphCypher::AnalyzeGraphQueryContext *ctx) { + auto *analyze_graph_query = storage_->Create(); + if (ctx->listOfColonSymbolicNames()) { + analyze_graph_query->labels_ = + std::any_cast>(ctx->listOfColonSymbolicNames()->accept(this)); + } else { + analyze_graph_query->labels_.emplace_back("*"); + } + if (ctx->DELETE()) { + analyze_graph_query->action_ = AnalyzeGraphQuery::Action::DELETE; + } else 
{ + analyze_graph_query->action_ = AnalyzeGraphQuery::Action::ANALYZE; + } + query_ = analyze_graph_query; + return analyze_graph_query; +} + antlrcpp::Any CypherMainVisitor::visitReplicationQuery(MemgraphCypher::ReplicationQueryContext *ctx) { MG_ASSERT(ctx->children.size() == 1, "ReplicationQuery should have exactly one child!"); auto *replication_query = std::any_cast(ctx->children[0]->accept(this)); @@ -1441,19 +1458,23 @@ antlrcpp::Any CypherMainVisitor::visitEntityPrivilegeList(MemgraphCypher::Entity return result; } +antlrcpp::Any CypherMainVisitor::visitListOfColonSymbolicNames(MemgraphCypher::ListOfColonSymbolicNamesContext *ctx) { + std::vector symbolic_names; + for (auto *symbolic_name : ctx->colonSymbolicName()) { + symbolic_names.push_back(std::any_cast(symbolic_name->symbolicName()->accept(this))); + } + return symbolic_names; +} + /** * @return std::vector */ antlrcpp::Any CypherMainVisitor::visitEntitiesList(MemgraphCypher::EntitiesListContext *ctx) { std::vector entities; - if (ctx->listOfEntities()) { - for (auto *entity : ctx->listOfEntities()->entity()) { - entities.push_back(std::any_cast(entity->symbolicName()->accept(this))); - } - } else { - entities.emplace_back("*"); + if (ctx->listOfColonSymbolicNames()) { + return ctx->listOfColonSymbolicNames()->accept(this); } - + entities.emplace_back("*"); return entities; } diff --git a/src/query/frontend/ast/cypher_main_visitor.hpp b/src/query/frontend/ast/cypher_main_visitor.hpp index aa37b383f..6995adeea 100644 --- a/src/query/frontend/ast/cypher_main_visitor.hpp +++ b/src/query/frontend/ast/cypher_main_visitor.hpp @@ -183,6 +183,16 @@ class CypherMainVisitor : public antlropencypher::MemgraphCypherBaseVisitor { */ antlrcpp::Any visitDumpQuery(MemgraphCypher::DumpQueryContext *ctx) override; + /** + @return std::vector + */ + antlrcpp::Any visitListOfColonSymbolicNames(MemgraphCypher::ListOfColonSymbolicNamesContext *ctx) override; + + /** + * @return AnalyzeGraphQuery* + */ + antlrcpp::Any 
visitAnalyzeGraphQuery(MemgraphCypher::AnalyzeGraphQueryContext *ctx) override; + /** * @return ReplicationQuery* */ diff --git a/src/query/frontend/opencypher/grammar/MemgraphCypher.g4 b/src/query/frontend/opencypher/grammar/MemgraphCypher.g4 index c189168c8..5f16a99d3 100644 --- a/src/query/frontend/opencypher/grammar/MemgraphCypher.g4 +++ b/src/query/frontend/opencypher/grammar/MemgraphCypher.g4 @@ -22,6 +22,7 @@ import Cypher ; memgraphCypherKeyword : cypherKeyword | AFTER | ALTER + | ANALYZE | ASYNC | AUTH | BAD @@ -53,6 +54,7 @@ memgraphCypherKeyword : cypherKeyword | FREE | FROM | GLOBAL + | GRAPH | GRANT | HEADER | IDENTIFIED @@ -119,6 +121,7 @@ query : cypherQuery | constraintQuery | authQuery | dumpQuery + | analyzeGraphQuery | replicationQuery | lockPathQuery | freeMemoryQuery @@ -291,11 +294,11 @@ revokePrivilegesList : privilegeOrEntities ( ',' privilegeOrEntities )* ; privilegesList : privilege ( ',' privilege )* ; -entitiesList : ASTERISK | listOfEntities ; +entitiesList : ASTERISK | listOfColonSymbolicNames ; -listOfEntities : entity ( ',' entity )* ; +listOfColonSymbolicNames : colonSymbolicName ( ',' colonSymbolicName )* ; -entity : COLON symbolicName ; +colonSymbolicName : COLON symbolicName ; showPrivileges : SHOW PRIVILEGES FOR userOrRole=userOrRoleName ; @@ -305,6 +308,8 @@ showUsersForRole : SHOW USERS FOR role=userOrRoleName ; dumpQuery: DUMP DATABASE ; +analyzeGraphQuery: ANALYZE GRAPH ( ON LABELS ( listOfColonSymbolicNames | ASTERISK ) ) ? ( DELETE STATISTICS ) ? ; + setReplicationRole : SET REPLICATION ROLE TO ( MAIN | REPLICA ) ( WITH PORT port=literal ) ? 
; diff --git a/src/query/frontend/opencypher/grammar/MemgraphCypherLexer.g4 b/src/query/frontend/opencypher/grammar/MemgraphCypherLexer.g4 index 86911785a..e80fa0ae7 100644 --- a/src/query/frontend/opencypher/grammar/MemgraphCypherLexer.g4 +++ b/src/query/frontend/opencypher/grammar/MemgraphCypherLexer.g4 @@ -27,6 +27,7 @@ UNDERSCORE : '_' ; AFTER : A F T E R ; ALTER : A L T E R ; +ANALYZE : A N A L Y Z E ; ASYNC : A S Y N C ; AUTH : A U T H ; BAD : B A D ; @@ -62,6 +63,7 @@ FREE_MEMORY : F R E E UNDERSCORE M E M O R Y ; FROM : F R O M ; GLOBAL : G L O B A L ; GRANT : G R A N T ; +GRAPH : G R A P H ; GRANTS : G R A N T S ; HEADER : H E A D E R ; IDENTIFIED : I D E N T I F I E D ; @@ -99,6 +101,7 @@ SETTING : S E T T I N G ; SETTINGS : S E T T I N G S ; SNAPSHOT : S N A P S H O T ; START : S T A R T ; +STATISTICS : S T A T I S T I C S ; STATS : S T A T S ; STOP : S T O P ; STREAM : S T R E A M ; diff --git a/src/query/frontend/semantic/required_privileges.cpp b/src/query/frontend/semantic/required_privileges.cpp index ffb5b703c..b7dd1dc3c 100644 --- a/src/query/frontend/semantic/required_privileges.cpp +++ b/src/query/frontend/semantic/required_privileges.cpp @@ -25,9 +25,11 @@ class PrivilegeExtractor : public QueryVisitor, public HierarchicalTreeVis std::vector privileges() { return privileges_; } - void Visit(IndexQuery &) override { AddPrivilege(AuthQuery::Privilege::INDEX); } + void Visit(IndexQuery & /*unused*/) override { AddPrivilege(AuthQuery::Privilege::INDEX); } - void Visit(AuthQuery &) override { AddPrivilege(AuthQuery::Privilege::AUTH); } + void Visit(AnalyzeGraphQuery & /*unused*/) override { AddPrivilege(AuthQuery::Privilege::INDEX); } + + void Visit(AuthQuery & /*unused*/) override { AddPrivilege(AuthQuery::Privilege::AUTH); } void Visit(ExplainQuery &query) override { query.cypher_query_->Accept(*this); } diff --git a/src/query/interpreter.cpp b/src/query/interpreter.cpp index 180cc707d..66d572f49 100644 --- a/src/query/interpreter.cpp +++ 
b/src/query/interpreter.cpp @@ -331,11 +331,11 @@ Callback HandleAuthQuery(AuthQuery *auth_query, AuthQueryHandler *auth, const Pa auth->GrantPrivilege(username, kPrivilegesAll #ifdef MG_ENTERPRISE , - {{{AuthQuery::FineGrainedPrivilege::CREATE_DELETE, {auth::kAsterisk}}}}, + {{{AuthQuery::FineGrainedPrivilege::CREATE_DELETE, {query::kAsterisk}}}}, { { { - AuthQuery::FineGrainedPrivilege::CREATE_DELETE, { auth::kAsterisk } + AuthQuery::FineGrainedPrivilege::CREATE_DELETE, { query::kAsterisk } } } } @@ -1408,6 +1408,139 @@ PreparedQuery PrepareDumpQuery(ParsedQuery parsed_query, std::map> AnalyzeGraphQueryHandler::AnalyzeGraphCreateStatistics( + const std::span labels, DbAccessor *execution_db_accessor) { + using LPIndex = std::pair; + + std::vector> results; + std::map> counter; + + // Preprocess labels to avoid later checks + std::vector indices_info = execution_db_accessor->ListAllIndices().label_property; + if (labels[0] != kAsterisk) { + for (auto it = indices_info.cbegin(); it != indices_info.cend();) { + if (std::find(labels.begin(), labels.end(), execution_db_accessor->LabelToName(it->first)) == labels.end()) { + it = indices_info.erase(it); + } else { + ++it; + } + } + } + // Iterate over all indexed vertices + std::for_each(indices_info.begin(), indices_info.end(), [execution_db_accessor, &counter](const LPIndex &index_info) { + auto vertices = execution_db_accessor->Vertices(storage::View::OLD, index_info.first, index_info.second); + std::for_each(vertices.begin(), vertices.end(), [&index_info, &counter](const auto &vertex) { + counter[index_info][*vertex.GetProperty(storage::View::OLD, index_info.second)]++; + }); + }); + + results.reserve(counter.size()); + std::for_each(counter.begin(), counter.end(), [&results, execution_db_accessor](const auto &counter_entry) { + const auto &[label_property, values_map] = counter_entry; + std::vector result; + result.reserve(kDeleteStatisticsNumResults); + // Extract info + int64_t count_property_value = 
std::accumulate( + values_map.begin(), values_map.end(), 0, + [](int64_t prev_value, const auto &prop_value_count) { return prev_value + prop_value_count.second; }); + // The number of distinct values is never 0 here + double avg_group_size = static_cast(count_property_value) / static_cast(values_map.size()); + double chi_squared_stat = std::accumulate( + values_map.begin(), values_map.end(), 0.0, [avg_group_size](double prev_result, const auto &value_entry) { + return prev_result + utils::ChiSquaredValue(value_entry.second, avg_group_size); + }); + execution_db_accessor->SetIndexStats( + label_property.first, label_property.second, + storage::IndexStats{.statistic = chi_squared_stat, .avg_group_size = avg_group_size}); + // Save result + result.emplace_back(execution_db_accessor->LabelToName(label_property.first)); + result.emplace_back(execution_db_accessor->PropertyToName(label_property.second)); + result.emplace_back(count_property_value); + result.emplace_back(static_cast(values_map.size())); + result.emplace_back(avg_group_size); + result.emplace_back(chi_squared_stat); + results.push_back(std::move(result)); + }); + return results; +} + +std::vector> AnalyzeGraphQueryHandler::AnalyzeGraphDeleteStatistics( + const std::span labels, DbAccessor *execution_db_accessor) { + std::vector> loc_results; + if (labels[0] == kAsterisk) { + loc_results = execution_db_accessor->ClearIndexStats(); + } else { + loc_results = execution_db_accessor->DeleteIndexStatsForLabels(labels); + } + std::vector> results; + std::transform(loc_results.begin(), loc_results.end(), std::back_inserter(results), + [execution_db_accessor](const auto &label_property_index) { + return std::vector{ + TypedValue(execution_db_accessor->LabelToName(label_property_index.first)), + TypedValue(execution_db_accessor->PropertyToName(label_property_index.second))}; + }); + return results; +} + +Callback HandleAnalyzeGraphQuery(AnalyzeGraphQuery *analyze_graph_query, DbAccessor *execution_db_accessor) { + Callback 
callback; + switch (analyze_graph_query->action_) { + case AnalyzeGraphQuery::Action::ANALYZE: { + callback.header = {"label", "property", "num estimation nodes", + "num groups", "avg group size", "chi-squared value"}; + callback.fn = [handler = AnalyzeGraphQueryHandler(), labels = analyze_graph_query->labels_, + execution_db_accessor]() mutable { + return handler.AnalyzeGraphCreateStatistics(labels, execution_db_accessor); + }; + break; + } + case AnalyzeGraphQuery::Action::DELETE: { + callback.header = {"label", "property"}; + callback.fn = [handler = AnalyzeGraphQueryHandler(), labels = analyze_graph_query->labels_, + execution_db_accessor]() mutable { + return handler.AnalyzeGraphDeleteStatistics(labels, execution_db_accessor); + }; + break; + } + } + + return callback; +} + +PreparedQuery PrepareAnalyzeGraphQuery(ParsedQuery parsed_query, bool in_explicit_transaction, + DbAccessor *execution_db_accessor, InterpreterContext *interpreter_context) { + if (in_explicit_transaction) { + throw AnalyzeGraphInMulticommandTxException(); + } + + // Changing index statistics influences index selection, so cached plans are invalidated. 
+ auto invalidate_plan_cache = [plan_cache = &interpreter_context->plan_cache] { + auto access = plan_cache->access(); + for (auto &kv : access) { + access.remove(kv.first); + } + }; + utils::OnScopeExit cache_invalidator(invalidate_plan_cache); + + auto *analyze_graph_query = utils::Downcast(parsed_query.query); + MG_ASSERT(analyze_graph_query); + auto callback = HandleAnalyzeGraphQuery(analyze_graph_query, execution_db_accessor); + + return PreparedQuery{std::move(callback.header), std::move(parsed_query.required_privileges), + [callback_fn = std::move(callback.fn), pull_plan = std::shared_ptr{nullptr}]( + AnyStream *stream, std::optional n) mutable -> std::optional { + if (UNLIKELY(!pull_plan)) { + pull_plan = std::make_shared(callback_fn()); + } + + if (pull_plan->Pull(stream, n)) { + return QueryHandlerResult::COMMIT; + } + return std::nullopt; + }, + RWType::NONE}; +} + PreparedQuery PrepareIndexQuery(ParsedQuery parsed_query, bool in_explicit_transaction, std::vector *notifications, InterpreterContext *interpreter_context) { if (in_explicit_transaction) { @@ -2504,7 +2637,7 @@ Interpreter::PrepareResult Interpreter::Prepare(const std::string &query_string, if (!in_explicit_transaction_ && (utils::Downcast(parsed_query.query) || utils::Downcast(parsed_query.query) || utils::Downcast(parsed_query.query) || utils::Downcast(parsed_query.query) || - utils::Downcast(parsed_query.query) || + utils::Downcast(parsed_query.query) || utils::Downcast(parsed_query.query) || utils::Downcast(parsed_query.query))) { db_accessor_ = std::make_unique(interpreter_context_->db->Access(GetIsolationLevelOverride())); @@ -2537,6 +2670,9 @@ Interpreter::PrepareResult Interpreter::Prepare(const std::string &query_string, } else if (utils::Downcast(parsed_query.query)) { prepared_query = PrepareIndexQuery(std::move(parsed_query), in_explicit_transaction_, &query_execution->notifications, interpreter_context_); + } else if (utils::Downcast(parsed_query.query)) { + prepared_query = 
PrepareAnalyzeGraphQuery(std::move(parsed_query), in_explicit_transaction_, + &*execution_db_accessor_, interpreter_context_); } else if (utils::Downcast(parsed_query.query)) { prepared_query = PrepareAuthQuery( std::move(parsed_query), in_explicit_transaction_, &query_execution->summary, interpreter_context_, diff --git a/src/query/interpreter.hpp b/src/query/interpreter.hpp index 132cdc790..fc9842df3 100644 --- a/src/query/interpreter.hpp +++ b/src/query/interpreter.hpp @@ -172,6 +172,24 @@ class ReplicationQueryHandler { virtual std::vector ShowReplicas() const = 0; }; +class AnalyzeGraphQueryHandler { + public: + AnalyzeGraphQueryHandler() = default; + virtual ~AnalyzeGraphQueryHandler() = default; + + AnalyzeGraphQueryHandler(const AnalyzeGraphQueryHandler &) = default; + AnalyzeGraphQueryHandler &operator=(const AnalyzeGraphQueryHandler &) = default; + + AnalyzeGraphQueryHandler(AnalyzeGraphQueryHandler &&) = default; + AnalyzeGraphQueryHandler &operator=(AnalyzeGraphQueryHandler &&) = default; + + static std::vector> AnalyzeGraphCreateStatistics(const std::span labels, + DbAccessor *execution_db_accessor); + + static std::vector> AnalyzeGraphDeleteStatistics(const std::span labels, + DbAccessor *execution_db_accessor); +}; + /** * A container for data related to the preparation of a query. */ diff --git a/src/query/plan/rewrite/index_lookup.hpp b/src/query/plan/rewrite/index_lookup.hpp index a6b3d5a9f..fcff3706f 100644 --- a/src/query/plan/rewrite/index_lookup.hpp +++ b/src/query/plan/rewrite/index_lookup.hpp @@ -27,6 +27,7 @@ #include "query/plan/operator.hpp" #include "query/plan/preprocess.hpp" +#include "storage/v2/indices.hpp" DECLARE_int64(query_vertex_count_to_expand_existing); @@ -482,6 +483,7 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor { // FilterInfo with PropertyFilter. 
FilterInfo filter; int64_t vertex_count; + std::optional index_stats; }; bool DefaultPreVisit() override { throw utils::NotYetImplemented("optimizing index lookup"); } @@ -522,8 +524,11 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor { return best_label; } - // Finds the label-property combination which has indexed the lowest amount of - // vertices. If the index cannot be found, nullopt is returned. + // Finds the best label-property index. The first criterion is the number of vertices indexed -> if one index has + // 10x less than the other one, always choose the smaller one. Otherwise, choose the index with smallest average group + // size based on key distribution. If average group size is equal, choose the index that has distribution closer to + // uniform distribution. Conditions based on average group size and key distribution can be only taken into account if + // the user has run `ANALYZE GRAPH` query before. If the index cannot be found, nullopt is returned. std::optional FindBestLabelPropertyIndex(const Symbol &symbol, const std::unordered_set &bound_symbols) { auto are_bound = [&bound_symbols](const auto &used_symbols) { @@ -534,6 +539,27 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor { } return true; }; + + /* + * Comparator function between two indices. If new index has >= 10x vertices than the existing, it cannot be better. + * If it is <= 10x in number of vertices, check average group size of property values. The index with smaller + * average group size is better. If the average group size is the same, choose the one closer to the uniform + * distribution. + * @param found: Current best label-property index. + * @param new_stats: Statistics of the candidate index. + * @param vertex_count: New index's number of vertices. + * @return -1 if the new index is better, 0 if they are equal and 1 if the existing one is better. 
+ */ + auto compare_indices = [](std::optional &found, std::optional &new_stats, + int vertex_count) { + if (!new_stats.has_value() || vertex_count / 10.0 > found->vertex_count) { + return 1; + } + int cmp_avg_group = utils::CompareDecimal(new_stats->avg_group_size, found->index_stats->avg_group_size); + if (cmp_avg_group != 0) return cmp_avg_group; + return utils::CompareDecimal(new_stats->statistic, found->index_stats->statistic); + }; + std::optional found; for (const auto &label : filters_.FilteredLabels(symbol)) { for (const auto &filter : filters_.PropertyFilters(symbol)) { @@ -548,7 +574,6 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor { if (!db_->LabelPropertyIndexExists(GetLabel(label), GetProperty(property))) { continue; } - int64_t vertex_count = db_->VerticesCount(GetLabel(label), GetProperty(property)); auto is_better_type = [&found](PropertyFilter::Type type) { // Order the types by the most preferred index lookup type. static const PropertyFilter::Type kFilterTypeOrder[] = { @@ -557,17 +582,32 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor { auto *type_sort_ix = std::find(kFilterTypeOrder, kFilterTypeOrder + 3, type); return type_sort_ix < found_sort_ix; }; - if (!found || vertex_count < found->vertex_count || - (vertex_count == found->vertex_count && is_better_type(filter.property_filter->type_))) { - found = LabelPropertyIndex{label, filter, vertex_count}; + + int64_t vertex_count = db_->VerticesCount(GetLabel(label), GetProperty(property)); + std::optional new_stats = db_->GetIndexStats(GetLabel(label), GetProperty(property)); + + // Conditions, from more to less important: + // the index with 10x less vertices is better. + // the index with smaller average group size is better. + // the index with equal avg group size and distribution closer to the uniform is better. + // the index with less vertices is better. 
+ // the index with same number of vertices but more optimized filter is better. + if (!found || vertex_count * 10 < found->vertex_count) { + found = LabelPropertyIndex{label, filter, vertex_count, new_stats}; + continue; + } + + if (int cmp_res = compare_indices(found, new_stats, vertex_count); + cmp_res == -1 || + cmp_res == 0 && (found->vertex_count > vertex_count || + found->vertex_count == vertex_count && is_better_type(filter.property_filter->type_))) { + found = LabelPropertyIndex{label, filter, vertex_count, new_stats}; } } } return found; } - - // Creates a ScanAll by the best possible index for the `node_symbol`. Best - // index is defined as the index with least number of vertices. If the node + // Creates a ScanAll by the best possible index for the `node_symbol`. If the node // does not have at least a label, no indexed lookup can be created and // `nullptr` is returned. The operator is chained after `input`. Optional // `max_vertex_count` controls, whether no operator should be created if the diff --git a/src/query/plan/vertex_count_cache.hpp b/src/query/plan/vertex_count_cache.hpp index 2900ec372..f00070eaa 100644 --- a/src/query/plan/vertex_count_cache.hpp +++ b/src/query/plan/vertex_count_cache.hpp @@ -1,4 +1,4 @@ -// Copyright 2022 Memgraph Ltd. +// Copyright 2023 Memgraph Ltd. 
// // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -78,6 +78,11 @@ class VertexCountCache { return db_->LabelPropertyIndexExists(label, property); } + std::optional GetIndexStats(const storage::LabelId &label, + const storage::PropertyId &property) const { + return db_->GetIndexStats(label, property); + } + private: typedef std::pair LabelPropertyKey; diff --git a/src/storage/v2/indices.cpp b/src/storage/v2/indices.cpp index fb83ff166..e02e0f9fc 100644 --- a/src/storage/v2/indices.cpp +++ b/src/storage/v2/indices.cpp @@ -1,4 +1,4 @@ -// Copyright 2022 Memgraph Ltd. +// Copyright 2023 Memgraph Ltd. // // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -10,6 +10,8 @@ // licenses/APL.txt. #include "indices.hpp" +#include +#include #include #include "storage/v2/mvcc.hpp" @@ -688,6 +690,45 @@ int64_t LabelPropertyIndex::ApproximateVertexCount(LabelId label, PropertyId pro return acc.estimate_range_count(lower, upper, utils::SkipListLayerForCountEstimation(acc.size())); } +/* +Iterates over all label-property pairs and deletes the stored statistics of each pair whose label equals the label parameter. 
+*/ +std::vector> LabelPropertyIndex::DeleteIndexStatsForLabel( + const storage::LabelId &label) { + std::vector> deleted_indexes; + for (auto it = stats_.cbegin(); it != stats_.cend();) { + if (it->first.first == label) { + deleted_indexes.push_back(it->first); + it = stats_.erase(it); + } else { + ++it; + } + } + return deleted_indexes; +} + +std::vector> LabelPropertyIndex::ClearIndexStats() { + std::vector> deleted_indexes; + deleted_indexes.reserve(stats_.size()); + std::transform(stats_.begin(), stats_.end(), std::back_inserter(deleted_indexes), + [](const auto &elem) { return elem.first; }); + stats_.clear(); + return deleted_indexes; +} + +void LabelPropertyIndex::SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property, + const IndexStats &stats) { + stats_[{label, property}] = stats; +} + +std::optional LabelPropertyIndex::GetIndexStats(const storage::LabelId &label, + const storage::PropertyId &property) const { + if (auto it = stats_.find({label, property}); it != stats_.end()) { + return it->second; + } + return {}; +} + void LabelPropertyIndex::RunGC() { for (auto &index_entry : index_) { index_entry.second.run_gc(); diff --git a/src/storage/v2/indices.hpp b/src/storage/v2/indices.hpp index eed22e8b5..dfb990e82 100644 --- a/src/storage/v2/indices.hpp +++ b/src/storage/v2/indices.hpp @@ -1,4 +1,4 @@ -// Copyright 2022 Memgraph Ltd. +// Copyright 2023 Memgraph Ltd. 
// // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -131,6 +131,10 @@ class LabelIndex { Config::Items config_; }; +struct IndexStats { + double statistic, avg_group_size; +}; + class LabelPropertyIndex { private: struct Entry { @@ -237,12 +241,23 @@ class LabelPropertyIndex { const std::optional> &lower, const std::optional> &upper) const; + std::vector> ClearIndexStats(); + + std::vector> DeleteIndexStatsForLabel(const storage::LabelId &label); + + void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property, + const storage::IndexStats &stats); + + std::optional GetIndexStats(const storage::LabelId &label, + const storage::PropertyId &property) const; + void Clear() { index_.clear(); } void RunGC(); private: std::map, utils::SkipList> index_; + std::map, storage::IndexStats> stats_; Indices *indices_; Constraints *constraints_; Config::Items config_; diff --git a/src/storage/v2/replication/replication_client.cpp b/src/storage/v2/replication/replication_client.cpp index 28afb9e12..e458a0f4d 100644 --- a/src/storage/v2/replication/replication_client.cpp +++ b/src/storage/v2/replication/replication_client.cpp @@ -1,4 +1,4 @@ -// Copyright 2022 Memgraph Ltd. +// Copyright 2023 Memgraph Ltd. 
// // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source diff --git a/src/storage/v2/storage.hpp b/src/storage/v2/storage.hpp index 407b0c090..925d81c86 100644 --- a/src/storage/v2/storage.hpp +++ b/src/storage/v2/storage.hpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "io/network/endpoint.hpp" @@ -265,6 +266,30 @@ class Storage final { return storage_->indices_.label_property_index.ApproximateVertexCount(label, property, lower, upper); } + std::optional GetIndexStats(const storage::LabelId &label, + const storage::PropertyId &property) const { + return storage_->indices_.label_property_index.GetIndexStats(label, property); + } + + std::vector> ClearIndexStats() { + return storage_->indices_.label_property_index.ClearIndexStats(); + } + + std::vector> DeleteIndexStatsForLabels(const std::span labels) { + std::vector> deleted_indexes; + std::for_each(labels.begin(), labels.end(), [this, &deleted_indexes](const auto &label_str) { + std::vector> loc_results = + storage_->indices_.label_property_index.DeleteIndexStatsForLabel(NameToLabel(label_str)); + deleted_indexes.insert(deleted_indexes.end(), std::make_move_iterator(loc_results.begin()), + std::make_move_iterator(loc_results.end())); + }); + return deleted_indexes; + } + + void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property, const IndexStats &stats) { + storage_->indices_.label_property_index.SetIndexStats(label, property, stats); + } + /// @return Accessor to the deleted vertex if a deletion took place, std::nullopt otherwise /// @throw std::bad_alloc Result> DeleteVertex(VertexAccessor *vertex); diff --git a/src/utils/math.hpp b/src/utils/math.hpp index eb6e645ab..ab0e7a05d 100644 --- a/src/utils/math.hpp +++ b/src/utils/math.hpp @@ -1,4 +1,4 @@ -// Copyright 2022 Memgraph Ltd. +// Copyright 2023 Memgraph Ltd. 
// // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -11,11 +11,15 @@ #pragma once +#include +#include #include #include #include #include +#include + namespace memgraph::utils { static_assert(std::is_same_v, @@ -64,4 +68,36 @@ constexpr std::optional RoundUint64ToMultiple(uint64_t val, uint64_t m return (numerator / multiple) * multiple; } +template +concept FloatingPoint = std::is_floating_point_v; + +template +bool ApproxEqualDecimal(T a, T b) { + return boost::math::relative_difference(a, b) < std::numeric_limits::epsilon(); +} + +template +bool LessThanDecimal(T a, T b) { + return (b - a) > std::numeric_limits::epsilon(); +} + +/* + * return 0 if a == b + * return 1 if a > b + * return -1 if a < b + */ +template +int CompareDecimal(T a, T b) { + if (ApproxEqualDecimal(a, b)) return 0; + if (LessThanDecimal(a, b)) return -1; + return 1; +} + +constexpr double ChiSquaredValue(double observed, double expected) { + if (utils::ApproxEqualDecimal(expected, 0.0)) { + return std::numeric_limits::max(); + } + return (observed - expected) * (observed - expected) / expected; +} + } // namespace memgraph::utils diff --git a/src/utils/typeinfo.hpp b/src/utils/typeinfo.hpp index bad53f9c6..7230213a5 100644 --- a/src/utils/typeinfo.hpp +++ b/src/utils/typeinfo.hpp @@ -176,9 +176,9 @@ enum class TypeId : uint64_t { AST_VERSION_QUERY, AST_FOREACH, AST_SHOW_CONFIG_QUERY, + AST_ANALYZE_GRAPH_QUERY, AST_TRANSACTION_QUEUE_QUERY, AST_EXISTS, - // Symbol SYMBOL, }; diff --git a/tests/e2e/CMakeLists.txt b/tests/e2e/CMakeLists.txt index bb7b6839e..e9da08919 100644 --- a/tests/e2e/CMakeLists.txt +++ b/tests/e2e/CMakeLists.txt @@ -44,6 +44,7 @@ add_subdirectory(module_file_manager) add_subdirectory(monitoring_server) add_subdirectory(lba_procedures) add_subdirectory(python_query_modules_reloading) +add_subdirectory(analyze_graph) 
add_subdirectory(transaction_queue) add_subdirectory(mock_api) diff --git a/tests/e2e/analyze_graph/CMakeLists.txt b/tests/e2e/analyze_graph/CMakeLists.txt new file mode 100644 index 000000000..1b96eb960 --- /dev/null +++ b/tests/e2e/analyze_graph/CMakeLists.txt @@ -0,0 +1,6 @@ +function(copy_analyze_graph_e2e_python_files FILE_NAME) + copy_e2e_python_files(analyze_graph ${FILE_NAME}) +endfunction() + +copy_analyze_graph_e2e_python_files(common.py) +copy_analyze_graph_e2e_python_files(optimize_indexes.py) diff --git a/tests/e2e/analyze_graph/common.py b/tests/e2e/analyze_graph/common.py new file mode 100644 index 000000000..d8b0cabf7 --- /dev/null +++ b/tests/e2e/analyze_graph/common.py @@ -0,0 +1,29 @@ +# Copyright 2023 Memgraph Ltd. +# +# Use of this software is governed by the Business Source License +# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source +# License, and you may not use this file except in compliance with the Business Source License. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0, included in the file +# licenses/APL.txt. 
+ +import typing + +import mgclient +import pytest + + +def execute_and_fetch_all(cursor: mgclient.Cursor, query: str, params: dict = {}) -> typing.List[tuple]: + cursor.execute(query, params) + return cursor.fetchall() + + +@pytest.fixture +def connect(**kwargs) -> mgclient.Connection: + connection = mgclient.connect(host="localhost", port=7687, **kwargs) + connection.autocommit = True + yield connection + cursor = connection.cursor() + execute_and_fetch_all(cursor, "MATCH (n) DETACH DELETE n") diff --git a/tests/e2e/analyze_graph/optimize_indexes.py b/tests/e2e/analyze_graph/optimize_indexes.py new file mode 100644 index 000000000..82290c673 --- /dev/null +++ b/tests/e2e/analyze_graph/optimize_indexes.py @@ -0,0 +1,282 @@ +# Copyright 2023 Memgraph Ltd. +# +# Use of this software is governed by the Business Source License +# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source +# License, and you may not use this file except in compliance with the Business Source License. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0, included in the file +# licenses/APL.txt. 
+ +import sys + +import pytest +from common import connect, execute_and_fetch_all + +# E2E tests for checking query semantic +# ------------------------------------ + + +@pytest.mark.parametrize( + "delete_query", + [ + "ANALYZE GRAPH DELETE STATISTICS", + "ANALYZE GRAPH ON LABELS * DELETE STATISTICS", + "ANALYZE GRAPH ON LABELS :Label DELETE STATISTICS", + "ANALYZE GRAPH ON LABELS :Label, :NONEXISTING DELETE STATISTICS", + ], +) +def test_analyze_graph_delete_statistics(delete_query, connect): + """Tests that all variants of delete queries work as expected.""" + cursor = connect.cursor() + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 100) | CREATE (n:Label {id1: i}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 50) | CREATE (n:Label {id2: i % 5}));") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);") + analyze_graph_results = execute_and_fetch_all(cursor, "ANALYZE GRAPH") + assert len(analyze_graph_results) == 2 + delete_stats_results = execute_and_fetch_all(cursor, delete_query) + assert len(delete_stats_results) == 2 + if delete_stats_results[0][1] == "id1": + first_index = 0 + else: + first_index = 1 + assert delete_stats_results[first_index] == ("Label", "id1") + assert delete_stats_results[1 - first_index] == ("Label", "id2") + # After deleting statistics, id2 should be chosen because it has less vertices + expected_explain_after_delete_analysis = [ + (f" * Produce {{n}}",), + (f" * Filter",), + (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",), + (f" * Once",), + ] + assert ( + execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;") + == expected_explain_after_delete_analysis + ) + execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);") + + +@pytest.mark.parametrize( + "analyze_query", + [ + "ANALYZE GRAPH", + "ANALYZE GRAPH ON LABELS *", + "ANALYZE 
GRAPH ON LABELS :Label", + "ANALYZE GRAPH ON LABELS :Label, :NONEXISTING", + ], +) +def test_analyze_full_graph(analyze_query, connect): + """Tests analyzing full graph and choosing better index based on the smaller average group size. + It also tests querying based on labels and that nothing bad will happen by providing non-existing label. + """ + cursor = connect.cursor() + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 100) | CREATE (n:Label {id1: i}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 50) | CREATE (n:Label {id2: i % 5}));") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);") + # Choose id2 before tha analysis because it has less vertices + expected_explain_before_analysis = [ + (f" * Produce {{n}}",), + (f" * Filter",), + (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",), + (f" * Once",), + ] + assert ( + execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;") + == expected_explain_before_analysis + ) + # Run analyze query + analyze_graph_results = execute_and_fetch_all(cursor, analyze_query) + assert len(analyze_graph_results) == 2 + if analyze_graph_results[0][1] == "id1": + first_index = 0 + else: + first_index = 1 + # Check results + assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0) + assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 5, 10, 0) + # After analyzing graph, id1 index should be chosen because it has smaller average group size + expected_explain_after_analysis = [ + (f" * Produce {{n}}",), + (f" * Filter",), + (f" * ScanAllByLabelPropertyValue (n :Label {{id1}})",), + (f" * Once",), + ] + assert ( + execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;") + == expected_explain_after_analysis + ) + assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2 + 
execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);") + + +# Explicit index choosing tests +# ----------------------------- + + +def test_cardinality_different_avg_group_size_uniform_dist(connect): + """Tests index optimization with indices both having uniform distribution but one has smaller avg. group size.""" + cursor = connect.cursor() + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 100) | CREATE (n:Label {id1: i}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 100) | CREATE (n:Label {id2: i % 20}));") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);") + analyze_graph_results = execute_and_fetch_all(cursor, "ANALYZE GRAPH") + if analyze_graph_results[0][1] == "id1": + first_index = 0 + else: + first_index = 1 + # Check results + assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0) + assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 20, 5, 0) + expected_explain_after_analysis = [ + (f" * Produce {{n}}",), + (f" * Filter",), + (f" * ScanAllByLabelPropertyValue (n :Label {{id1}})",), + (f" * Once",), + ] + assert ( + execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;") + == expected_explain_after_analysis + ) + assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2 + execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);") + + +def test_cardinality_same_avg_group_size_uniform_dist_diff_vertex_count(connect): + """Tests index choosing where both indices have uniform key distribution with same avg. 
group size but one has less vertices.""" + cursor = connect.cursor() + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 100) | CREATE (n:Label {id1: i}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 50) | CREATE (n:Label {id2: i}));") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);") + analyze_graph_results = execute_and_fetch_all(cursor, "ANALYZE GRAPH") + if analyze_graph_results[0][1] == "id1": + first_index = 0 + else: + first_index = 1 + # Check results + assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0) + assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 50, 1, 0) + expected_explain_after_analysis = [ + (f" * Produce {{n}}",), + (f" * Filter",), + (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",), + (f" * Once",), + ] + assert ( + execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;") + == expected_explain_after_analysis + ) + assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2 + execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);") + + +def test_large_diff_in_num_vertices_v1(connect): + """Tests that when one index has > 10x vertices than the other one, it should be chosen no matter avg group size and uniform distribution.""" + cursor = connect.cursor() + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 1000) | CREATE (n:Label {id1: i}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 99) | CREATE (n:Label {id2: 1}));") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);") + analyze_graph_results = execute_and_fetch_all(cursor, "ANALYZE GRAPH") + if analyze_graph_results[0][1] == "id1": + first_index = 0 + else: + first_index = 1 + # Check results + assert 
analyze_graph_results[first_index] == ("Label", "id1", 1000, 1000, 1, 0) + assert analyze_graph_results[1 - first_index] == ("Label", "id2", 99, 1, 99, 0) + expected_explain_after_analysis = [ + (f" * Produce {{n}}",), + (f" * Filter",), + (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",), + (f" * Once",), + ] + assert ( + execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;") + == expected_explain_after_analysis + ) + assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2 + execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);") + + +def test_large_diff_in_num_vertices_v2(connect): + """Tests that when one index has > 10x vertices than the other one, it should be chosen no matter avg group size and uniform distribution.""" + cursor = connect.cursor() + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 99) | CREATE (n:Label {id1: 1}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 1000) | CREATE (n:Label {id2: i}));") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);") + analyze_graph_results = execute_and_fetch_all(cursor, "ANALYZE GRAPH") + if analyze_graph_results[0][1] == "id1": + first_index = 0 + else: + first_index = 1 + # Check results + assert analyze_graph_results[first_index] == ("Label", "id1", 99, 1, 99, 0) + assert analyze_graph_results[1 - first_index] == ("Label", "id2", 1000, 1000, 1, 0) + expected_explain_after_analysis = [ + (f" * Produce {{n}}",), + (f" * Filter",), + (f" * ScanAllByLabelPropertyValue (n :Label {{id1}})",), + (f" * Once",), + ] + assert ( + execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;") + == expected_explain_after_analysis + ) + assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2 + execute_and_fetch_all(cursor, "DROP 
INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);") + + +def test_same_avg_group_size_diff_distribution(connect): + """Tests index choice decision based on key distribution.""" + cursor = connect.cursor() + # Setup first key distribution + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 10) | CREATE (n:Label {id1: 1}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 30) | CREATE (n:Label {id1: 2}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 20) | CREATE (n:Label {id1: 3}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 35) | CREATE (n:Label {id1: 4}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 5) | CREATE (n:Label {id1: 5}));") + # Setup second key distribution + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 1}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 2}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 3}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 4}));") + execute_and_fetch_all(cursor, "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 5}));") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "CREATE INDEX ON :Label(id2);") + analyze_graph_results = execute_and_fetch_all(cursor, "ANALYZE GRAPH") + if analyze_graph_results[0][1] == "id1": + first_index = 0 + else: + first_index = 1 + # Check results + assert analyze_graph_results[first_index] == ("Label", "id1", 100, 5, 20, 32.5) + assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 5, 20, 0) + expected_explain_after_analysis = [ + (f" * Produce {{n}}",), + (f" * Filter",), + (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",), + (f" * Once",), + ] + assert ( + execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;") + == 
expected_explain_after_analysis + ) + assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2 + execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id1);") + execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);") + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-rA"])) diff --git a/tests/e2e/analyze_graph/workloads.yaml b/tests/e2e/analyze_graph/workloads.yaml new file mode 100644 index 000000000..68e5482ee --- /dev/null +++ b/tests/e2e/analyze_graph/workloads.yaml @@ -0,0 +1,14 @@ +analyze_graph_cluster: &analyze_graph_cluster + cluster: + main: + args: ["--bolt-port", "7687", "--log-level=TRACE"] + log_file: "analyze_graph.log" + setup_queries: [] + validation_queries: [] + + +workloads: + - name: "Analyze graph for better indexing" + binary: "tests/e2e/pytest_runner.sh" + args: ["analyze_graph/optimize_indexes.py"] + <<: *analyze_graph_cluster diff --git a/tests/manual/interactive_planning.cpp b/tests/manual/interactive_planning.cpp index 09566e11b..fc8b6598e 100644 --- a/tests/manual/interactive_planning.cpp +++ b/tests/manual/interactive_planning.cpp @@ -1,4 +1,4 @@ -// Copyright 2022 Memgraph Ltd. +// Copyright 2023 Memgraph Ltd. // // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -213,6 +213,11 @@ class InteractiveDbAccessor { return label_property_index_.at(key); } + std::optional GetIndexStats(memgraph::storage::LabelId label, + memgraph::storage::PropertyId property) const { + return dba_->GetIndexStats(label, property); + } + // Save the cached vertex counts to a stream. 
void Save(std::ostream &out) { out << "vertex-count " << vertices_count_ << std::endl; diff --git a/tests/unit/query_plan_checker.hpp b/tests/unit/query_plan_checker.hpp index e48a27b29..15aecd030 100644 --- a/tests/unit/query_plan_checker.hpp +++ b/tests/unit/query_plan_checker.hpp @@ -11,6 +11,7 @@ #include #include +#include #include "query/frontend/semantic/symbol_generator.hpp" #include "query/frontend/semantic/symbol_table.hpp" @@ -459,6 +460,11 @@ class FakeDbAccessor { return false; } + memgraph::storage::IndexStats GetIndexStats(memgraph::storage::LabelId label, + memgraph::storage::PropertyId property) const { + return memgraph::storage::IndexStats{.statistic = 0, .avg_group_size = 1}; // unique id + } + void SetIndexCount(memgraph::storage::LabelId label, int64_t count) { label_index_[label] = count; } void SetIndexCount(memgraph::storage::LabelId label, memgraph::storage::PropertyId property, int64_t count) { diff --git a/tests/unit/utils_math.cpp b/tests/unit/utils_math.cpp index 0435d1083..e74c09f85 100644 --- a/tests/unit/utils_math.cpp +++ b/tests/unit/utils_math.cpp @@ -1,4 +1,4 @@ -// Copyright 2022 Memgraph Ltd. +// Copyright 2023 Memgraph Ltd. // // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -10,6 +10,7 @@ // licenses/APL.txt. 
#include +#include #include #include @@ -21,3 +22,29 @@ TEST(UtilsMath, Log2) { ASSERT_EQ(memgraph::utils::Log2(i), static_cast(log2(i))); } } + +TEST(UtilsMath, EqualFloat) { + ASSERT_TRUE(memgraph::utils::ApproxEqualDecimal(0.2f, 0.2f)); + ASSERT_TRUE(memgraph::utils::ApproxEqualDecimal(0.2f, 0.199999999999f)); + ASSERT_TRUE(memgraph::utils::ApproxEqualDecimal(0.2f, 0.200000000001f)); + ASSERT_FALSE(memgraph::utils::ApproxEqualDecimal(0.2f, 0.19995f)); +} + +TEST(UtilsMath, EqualDouble) { + ASSERT_TRUE(memgraph::utils::ApproxEqualDecimal(0.2, 0.2)); + ASSERT_TRUE(memgraph::utils::ApproxEqualDecimal(0.2, 0.19999999999999999999)); + ASSERT_TRUE(memgraph::utils::ApproxEqualDecimal(0.2, 0.20000000000000000001)); + ASSERT_FALSE(memgraph::utils::ApproxEqualDecimal(0.2, 0.19995)); +} + +TEST(UtilsMath, LessThan) { + ASSERT_TRUE(memgraph::utils::LessThanDecimal(0.2, 0.3)); + ASSERT_TRUE(memgraph::utils::LessThanDecimal(0.2, 0.20001)); +} + +TEST(UtilsMath, ChiSquared) { + ASSERT_EQ(std::numeric_limits::max(), memgraph::utils::ChiSquaredValue(2.0, 0.0)); + ASSERT_DOUBLE_EQ(0.0, memgraph::utils::ChiSquaredValue(2.0, 2.0)); + ASSERT_DOUBLE_EQ(1.0, memgraph::utils::ChiSquaredValue(2.0, 1.0)); + ASSERT_DOUBLE_EQ(1. / 3., memgraph::utils::ChiSquaredValue(4.0, 3.0)); +}