Add index statistics for better query planning (#812)

This commit is contained in:
Josipmrden 2023-03-30 15:34:34 +02:00 committed by GitHub
parent 0819b40202
commit 398503da7a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
34 changed files with 852 additions and 53 deletions

View File

@ -16,6 +16,7 @@
#include "auth/crypto.hpp"
#include "auth/exceptions.hpp"
#include "license/license.hpp"
#include "query/constants.hpp"
#include "utils/cast.hpp"
#include "utils/logging.hpp"
#include "utils/settings.hpp"
@ -270,7 +271,7 @@ PermissionLevel FineGrainedAccessPermissions::Has(const std::string &permission,
void FineGrainedAccessPermissions::Grant(const std::string &permission,
const FineGrainedPermission fine_grained_permission) {
if (permission == kAsterisk) {
if (permission == query::kAsterisk) {
global_permission_ = CalculateGrant(fine_grained_permission);
} else {
permissions_[permission] = CalculateGrant(fine_grained_permission);
@ -278,7 +279,7 @@ void FineGrainedAccessPermissions::Grant(const std::string &permission,
}
void FineGrainedAccessPermissions::Revoke(const std::string &permission) {
if (permission == kAsterisk) {
if (permission == query::kAsterisk) {
permissions_.clear();
global_permission_ = std::nullopt;
} else {

View File

@ -15,7 +15,6 @@
#include <json/json.hpp>
namespace memgraph::auth {
const std::string kAsterisk = "*";
// These permissions must have values that are applicable for usage in a
// bitmask.
// clang-format off

View File

@ -15,6 +15,7 @@
#include "auth/models.hpp"
#include "glue/auth.hpp"
#include "license/license.hpp"
#include "query/constants.hpp"
#include "query/frontend/ast/ast.hpp"
#include "utils/synchronized.hpp"
@ -38,7 +39,7 @@ bool IsUserAuthorizedGloballyLabels(const memgraph::auth::User &user,
if (!memgraph::license::global_license_checker.IsEnterpriseValidFast()) {
return true;
}
return user.GetFineGrainedAccessLabelPermissions().Has(memgraph::auth::kAsterisk, fine_grained_permission) ==
return user.GetFineGrainedAccessLabelPermissions().Has(memgraph::query::kAsterisk, fine_grained_permission) ==
memgraph::auth::PermissionLevel::GRANT;
}
@ -47,7 +48,7 @@ bool IsUserAuthorizedGloballyEdges(const memgraph::auth::User &user,
if (!memgraph::license::global_license_checker.IsEnterpriseValidFast()) {
return true;
}
return user.GetFineGrainedAccessEdgeTypePermissions().Has(memgraph::auth::kAsterisk, fine_grained_permission) ==
return user.GetFineGrainedAccessEdgeTypePermissions().Has(memgraph::query::kAsterisk, fine_grained_permission) ==
memgraph::auth::PermissionLevel::GRANT;
}

View File

@ -1,4 +1,4 @@
// Copyright 2022 Memgraph Ltd.
// Copyright 2023 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@ -18,6 +18,7 @@
#include "auth/models.hpp"
#include "glue/auth.hpp"
#include "license/license.hpp"
#include "query/constants.hpp"
namespace {
@ -253,19 +254,18 @@ bool AuthQueryHandler::CreateUser(const std::string &username, const std::option
if (first_user) {
spdlog::info("{} is first created user. Granting all privileges.", username);
GrantPrivilege(username, memgraph::query::kPrivilegesAll
GrantPrivilege(
username, memgraph::query::kPrivilegesAll
#ifdef MG_ENTERPRISE
,
{{{memgraph::query::AuthQuery::FineGrainedPrivilege::CREATE_DELETE, {memgraph::auth::kAsterisk}}}},
{
{
{
memgraph::query::AuthQuery::FineGrainedPrivilege::CREATE_DELETE, {
memgraph::auth::kAsterisk
}
}
}
}
,
{{{memgraph::query::AuthQuery::FineGrainedPrivilege::CREATE_DELETE, {memgraph::query::kAsterisk}}}},
{
{
{
memgraph::query::AuthQuery::FineGrainedPrivilege::CREATE_DELETE, { memgraph::query::kAsterisk }
}
}
}
#endif
);
}

View File

@ -1,4 +1,4 @@
// Copyright 2022 Memgraph Ltd.
// Copyright 2023 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@ -16,4 +16,6 @@
// Constants shared by the query layer.
namespace memgraph::query {
inline constexpr uint16_t kDefaultReplicationPort = 10000;
inline constexpr auto *kDefaultReplicationServerIp = "0.0.0.0";
// Wildcard token: compared against permission names in the auth code and against the first
// requested label in the ANALYZE GRAPH handlers to mean "everything".
inline const std::string kAsterisk = "*";
// Number of columns reserved for one result row of statistics.
// NOTE(review): despite "Delete" in the name, this is used to reserve the 6-column rows built when
// statistics are created; the delete action returns 2-column rows.
inline constexpr uint16_t kDeleteStatisticsNumResults = 6;
} // namespace memgraph::query

View File

@ -430,6 +430,25 @@ class DbAccessor final {
return accessor_->LabelPropertyIndexExists(label, prop);
}
/// Returns the statistics recorded for the (label, property) index, if any.
/// Pure forwarder to `accessor_->GetIndexStats`.
std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
return accessor_->GetIndexStats(label, property);
}
/// Forwards to `accessor_->ClearIndexStats()` and returns its (label, property) pairs unchanged.
std::vector<std::pair<storage::LabelId, storage::PropertyId>> ClearIndexStats() {
return accessor_->ClearIndexStats();
}
/// Forwards the label names to `accessor_->DeleteIndexStatsForLabels` and returns its
/// (label, property) pairs unchanged.
/// @param labels label names (not ids).
std::vector<std::pair<storage::LabelId, storage::PropertyId>> DeleteIndexStatsForLabels(
const std::span<std::string> labels) {
return accessor_->DeleteIndexStatsForLabels(labels);
}
/// Forwards `stats` for the (label, property) index to `accessor_->SetIndexStats`.
void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property,
const storage::IndexStats &stats) {
accessor_->SetIndexStats(label, property, stats);
}
int64_t VerticesCount() const { return accessor_->ApproximateVertexCount(); }
int64_t VerticesCount(storage::LabelId label) const { return accessor_->ApproximateVertexCount(label); }

View File

@ -229,6 +229,12 @@ class VersionInfoInMulticommandTxException : public QueryException {
: QueryException("Version info query not allowed in multicommand transactions.") {}
};
// Raised when an ANALYZE GRAPH query is prepared inside an explicit (multicommand) transaction.
class AnalyzeGraphInMulticommandTxException : public QueryException {
public:
AnalyzeGraphInMulticommandTxException()
: QueryException("Analyze graph query not allowed in multicommand transactions.") {}
};
class ReplicationException : public utils::BasicException {
public:
using utils::BasicException::BasicException;

View File

@ -260,6 +260,9 @@ constexpr utils::TypeInfo query::Foreach::kType{utils::TypeId::AST_FOREACH, "For
constexpr utils::TypeInfo query::ShowConfigQuery::kType{utils::TypeId::AST_SHOW_CONFIG_QUERY, "ShowConfigQuery",
&query::Query::kType};
constexpr utils::TypeInfo query::AnalyzeGraphQuery::kType{utils::TypeId::AST_ANALYZE_GRAPH_QUERY, "AnalyzeGraphQuery",
&query::Query::kType};
constexpr utils::TypeInfo query::TransactionQueueQuery::kType{utils::TypeId::AST_TRANSACTION_QUEUE_QUERY,
"TransactionQueueQuery", &query::Query::kType};

View File

@ -3230,6 +3230,26 @@ class TransactionQueueQuery : public memgraph::query::Query {
}
};
/// AST node for the `ANALYZE GRAPH` query.
///
/// `labels_` holds the label names the query targets; the parser stores the single entry "*" when
/// no explicit label list was given. `action_` selects between computing and deleting statistics.
class AnalyzeGraphQuery : public memgraph::query::Query {
 public:
  static const utils::TypeInfo kType;
  const utils::TypeInfo &GetTypeInfo() const override { return kType; }

  DEFVISITABLE(QueryVisitor<void>);

  /// ANALYZE: compute statistics; DELETE: drop previously stored statistics.
  enum class Action { ANALYZE, DELETE };

  Action action_;
  std::vector<std::string> labels_;

  /// Creates a copy of this node inside `storage`, carrying over the action and labels.
  AnalyzeGraphQuery *Clone(AstStorage *storage) const override {
    auto *copy = storage->Create<AnalyzeGraphQuery>();
    copy->action_ = action_;
    copy->labels_ = labels_;
    return copy;
  }
};
class Exists : public memgraph::query::Expression {
public:
static const utils::TypeInfo kType;

View File

@ -2403,6 +2403,26 @@ cpp<#
(:serialize (:slk))
(:clone))
;; AST node for the ANALYZE GRAPH query: an action (analyze or delete) plus the list of label
;; names it applies to.
(lcp:define-class analyze-graph-query (query)
((action "Action" :scope :public)
(labels "std::vector<std::string>" :scope :public))
(:public
(lcp:define-enum action
(analyze delete)
(:serialize))
#>cpp
AnalyzeGraphQuery() = default;
DEFVISITABLE(QueryVisitor<void>);
cpp<#)
(:private
#>cpp
friend class AstStorage;
cpp<#)
(:serialize (:slk))
(:clone))
(lcp:define-class replication-query (query)
((action "Action" :scope :public)
(role "ReplicationRole" :scope :public)

View File

@ -95,6 +95,7 @@ class SettingQuery;
class VersionQuery;
class Foreach;
class ShowConfigQuery;
class AnalyzeGraphQuery;
class TransactionQueueQuery;
class Exists;
@ -131,7 +132,7 @@ template <class TResult>
class QueryVisitor
: public utils::Visitor<TResult, CypherQuery, ExplainQuery, ProfileQuery, IndexQuery, AuthQuery, InfoQuery,
ConstraintQuery, DumpQuery, ReplicationQuery, LockPathQuery, FreeMemoryQuery, TriggerQuery,
IsolationLevelQuery, CreateSnapshotQuery, StreamQuery, SettingQuery, TransactionQueueQuery,
VersionQuery, ShowConfigQuery> {};
IsolationLevelQuery, CreateSnapshotQuery, StreamQuery, SettingQuery, VersionQuery,
ShowConfigQuery, TransactionQueueQuery, AnalyzeGraphQuery> {};
} // namespace memgraph::query

View File

@ -241,6 +241,23 @@ antlrcpp::Any CypherMainVisitor::visitDumpQuery(MemgraphCypher::DumpQueryContext
return dump_query;
}
/// Builds an AnalyzeGraphQuery node from the parse tree.
/// An explicit label list is taken from the grammar's colon-prefixed names; otherwise the single
/// wildcard entry "*" is stored. The presence of the DELETE token selects the delete action.
antlrcpp::Any CypherMainVisitor::visitAnalyzeGraphQuery(MemgraphCypher::AnalyzeGraphQueryContext *ctx) {
  auto *query = storage_->Create<AnalyzeGraphQuery>();

  if (auto *label_list = ctx->listOfColonSymbolicNames(); label_list != nullptr) {
    query->labels_ = std::any_cast<std::vector<std::string>>(label_list->accept(this));
  } else {
    query->labels_.emplace_back("*");
  }

  query->action_ = ctx->DELETE() ? AnalyzeGraphQuery::Action::DELETE : AnalyzeGraphQuery::Action::ANALYZE;

  query_ = query;
  return query;
}
antlrcpp::Any CypherMainVisitor::visitReplicationQuery(MemgraphCypher::ReplicationQueryContext *ctx) {
MG_ASSERT(ctx->children.size() == 1, "ReplicationQuery should have exactly one child!");
auto *replication_query = std::any_cast<ReplicationQuery *>(ctx->children[0]->accept(this));
@ -1441,19 +1458,23 @@ antlrcpp::Any CypherMainVisitor::visitEntityPrivilegeList(MemgraphCypher::Entity
return result;
}
/// Collects the symbolic name of every `colonSymbolicName` child, in source order.
/// @return std::vector<std::string>
antlrcpp::Any CypherMainVisitor::visitListOfColonSymbolicNames(MemgraphCypher::ListOfColonSymbolicNamesContext *ctx) {
  const auto name_contexts = ctx->colonSymbolicName();

  std::vector<std::string> names;
  names.reserve(name_contexts.size());
  for (auto *name_ctx : name_contexts) {
    names.push_back(std::any_cast<std::string>(name_ctx->symbolicName()->accept(this)));
  }
  return names;
}
/**
* @return std::vector<std::string>
*/
antlrcpp::Any CypherMainVisitor::visitEntitiesList(MemgraphCypher::EntitiesListContext *ctx) {
std::vector<std::string> entities;
if (ctx->listOfEntities()) {
for (auto *entity : ctx->listOfEntities()->entity()) {
entities.push_back(std::any_cast<std::string>(entity->symbolicName()->accept(this)));
}
} else {
entities.emplace_back("*");
if (ctx->listOfColonSymbolicNames()) {
return ctx->listOfColonSymbolicNames()->accept(this);
}
entities.emplace_back("*");
return entities;
}

View File

@ -183,6 +183,16 @@ class CypherMainVisitor : public antlropencypher::MemgraphCypherBaseVisitor {
*/
antlrcpp::Any visitDumpQuery(MemgraphCypher::DumpQueryContext *ctx) override;
/**
* @return std::vector<std::string>
*/
antlrcpp::Any visitListOfColonSymbolicNames(MemgraphCypher::ListOfColonSymbolicNamesContext *ctx) override;
/**
* @return AnalyzeGraphQuery*
*/
antlrcpp::Any visitAnalyzeGraphQuery(MemgraphCypher::AnalyzeGraphQueryContext *ctx) override;
/**
* @return ReplicationQuery*
*/

View File

@ -22,6 +22,7 @@ import Cypher ;
memgraphCypherKeyword : cypherKeyword
| AFTER
| ALTER
| ANALYZE
| ASYNC
| AUTH
| BAD
@ -53,6 +54,7 @@ memgraphCypherKeyword : cypherKeyword
| FREE
| FROM
| GLOBAL
| GRAPH
| GRANT
| HEADER
| IDENTIFIED
@ -119,6 +121,7 @@ query : cypherQuery
| constraintQuery
| authQuery
| dumpQuery
| analyzeGraphQuery
| replicationQuery
| lockPathQuery
| freeMemoryQuery
@ -291,11 +294,11 @@ revokePrivilegesList : privilegeOrEntities ( ',' privilegeOrEntities )* ;
privilegesList : privilege ( ',' privilege )* ;
entitiesList : ASTERISK | listOfEntities ;
entitiesList : ASTERISK | listOfColonSymbolicNames ;
listOfEntities : entity ( ',' entity )* ;
listOfColonSymbolicNames : colonSymbolicName ( ',' colonSymbolicName )* ;
entity : COLON symbolicName ;
colonSymbolicName : COLON symbolicName ;
showPrivileges : SHOW PRIVILEGES FOR userOrRole=userOrRoleName ;
@ -305,6 +308,8 @@ showUsersForRole : SHOW USERS FOR role=userOrRoleName ;
dumpQuery: DUMP DATABASE ;
analyzeGraphQuery: ANALYZE GRAPH ( ON LABELS ( listOfColonSymbolicNames | ASTERISK ) ) ? ( DELETE STATISTICS ) ? ;
setReplicationRole : SET REPLICATION ROLE TO ( MAIN | REPLICA )
( WITH PORT port=literal ) ? ;

View File

@ -27,6 +27,7 @@ UNDERSCORE : '_' ;
AFTER : A F T E R ;
ALTER : A L T E R ;
ANALYZE : A N A L Y Z E ;
ASYNC : A S Y N C ;
AUTH : A U T H ;
BAD : B A D ;
@ -62,6 +63,7 @@ FREE_MEMORY : F R E E UNDERSCORE M E M O R Y ;
FROM : F R O M ;
GLOBAL : G L O B A L ;
GRANT : G R A N T ;
GRAPH : G R A P H ;
GRANTS : G R A N T S ;
HEADER : H E A D E R ;
IDENTIFIED : I D E N T I F I E D ;
@ -99,6 +101,7 @@ SETTING : S E T T I N G ;
SETTINGS : S E T T I N G S ;
SNAPSHOT : S N A P S H O T ;
START : S T A R T ;
STATISTICS : S T A T I S T I C S ;
STATS : S T A T S ;
STOP : S T O P ;
STREAM : S T R E A M ;

View File

@ -25,9 +25,11 @@ class PrivilegeExtractor : public QueryVisitor<void>, public HierarchicalTreeVis
std::vector<AuthQuery::Privilege> privileges() { return privileges_; }
void Visit(IndexQuery &) override { AddPrivilege(AuthQuery::Privilege::INDEX); }
void Visit(IndexQuery & /*unused*/) override { AddPrivilege(AuthQuery::Privilege::INDEX); }
void Visit(AuthQuery &) override { AddPrivilege(AuthQuery::Privilege::AUTH); }
void Visit(AnalyzeGraphQuery & /*unused*/) override { AddPrivilege(AuthQuery::Privilege::INDEX); }
void Visit(AuthQuery & /*unused*/) override { AddPrivilege(AuthQuery::Privilege::AUTH); }
void Visit(ExplainQuery &query) override { query.cypher_query_->Accept(*this); }

View File

@ -331,11 +331,11 @@ Callback HandleAuthQuery(AuthQuery *auth_query, AuthQueryHandler *auth, const Pa
auth->GrantPrivilege(username, kPrivilegesAll
#ifdef MG_ENTERPRISE
,
{{{AuthQuery::FineGrainedPrivilege::CREATE_DELETE, {auth::kAsterisk}}}},
{{{AuthQuery::FineGrainedPrivilege::CREATE_DELETE, {query::kAsterisk}}}},
{
{
{
AuthQuery::FineGrainedPrivilege::CREATE_DELETE, { auth::kAsterisk }
AuthQuery::FineGrainedPrivilege::CREATE_DELETE, { query::kAsterisk }
}
}
}
@ -1408,6 +1408,139 @@ PreparedQuery PrepareDumpQuery(ParsedQuery parsed_query, std::map<std::string, T
RWType::R};
}
std::vector<std::vector<TypedValue>> AnalyzeGraphQueryHandler::AnalyzeGraphCreateStatistics(
const std::span<std::string> labels, DbAccessor *execution_db_accessor) {
using LPIndex = std::pair<storage::LabelId, storage::PropertyId>;
std::vector<std::vector<TypedValue>> results;
std::map<LPIndex, std::map<storage::PropertyValue, int64_t>> counter;
// Preprocess labels to avoid later checks
std::vector<LPIndex> indices_info = execution_db_accessor->ListAllIndices().label_property;
if (labels[0] != kAsterisk) {
for (auto it = indices_info.cbegin(); it != indices_info.cend();) {
if (std::find(labels.begin(), labels.end(), execution_db_accessor->LabelToName(it->first)) == labels.end()) {
it = indices_info.erase(it);
} else {
++it;
}
}
}
// Iterate over all indexed vertices
std::for_each(indices_info.begin(), indices_info.end(), [execution_db_accessor, &counter](const LPIndex &index_info) {
auto vertices = execution_db_accessor->Vertices(storage::View::OLD, index_info.first, index_info.second);
std::for_each(vertices.begin(), vertices.end(), [&index_info, &counter](const auto &vertex) {
counter[index_info][*vertex.GetProperty(storage::View::OLD, index_info.second)]++;
});
});
results.reserve(counter.size());
std::for_each(counter.begin(), counter.end(), [&results, execution_db_accessor](const auto &counter_entry) {
const auto &[label_property, values_map] = counter_entry;
std::vector<TypedValue> result;
result.reserve(kDeleteStatisticsNumResults);
// Extract info
int64_t count_property_value = std::accumulate(
values_map.begin(), values_map.end(), 0,
[](int64_t prev_value, const auto &prop_value_count) { return prev_value + prop_value_count.second; });
// num_distinc_values will never be 0
double avg_group_size = static_cast<double>(count_property_value) / static_cast<double>(values_map.size());
double chi_squared_stat = std::accumulate(
values_map.begin(), values_map.end(), 0.0, [avg_group_size](double prev_result, const auto &value_entry) {
return prev_result + utils::ChiSquaredValue(value_entry.second, avg_group_size);
});
execution_db_accessor->SetIndexStats(
label_property.first, label_property.second,
storage::IndexStats{.statistic = chi_squared_stat, .avg_group_size = avg_group_size});
// Save result
result.emplace_back(execution_db_accessor->LabelToName(label_property.first));
result.emplace_back(execution_db_accessor->PropertyToName(label_property.second));
result.emplace_back(count_property_value);
result.emplace_back(static_cast<int64_t>(values_map.size()));
result.emplace_back(avg_group_size);
result.emplace_back(chi_squared_stat);
results.push_back(std::move(result));
});
return results;
}
std::vector<std::vector<TypedValue>> AnalyzeGraphQueryHandler::AnalyzeGraphDeleteStatistics(
const std::span<std::string> labels, DbAccessor *execution_db_accessor) {
std::vector<std::pair<storage::LabelId, storage::PropertyId>> loc_results;
if (labels[0] == kAsterisk) {
loc_results = execution_db_accessor->ClearIndexStats();
} else {
loc_results = execution_db_accessor->DeleteIndexStatsForLabels(labels);
}
std::vector<std::vector<TypedValue>> results;
std::transform(loc_results.begin(), loc_results.end(), std::back_inserter(results),
[execution_db_accessor](const auto &label_property_index) {
return std::vector<TypedValue>{
TypedValue(execution_db_accessor->LabelToName(label_property_index.first)),
TypedValue(execution_db_accessor->PropertyToName(label_property_index.second))};
});
return results;
}
/// Builds the callback (result header plus deferred work) for an ANALYZE GRAPH query.
/// The labels are copied into the closure, so the callback does not depend on the AST node's
/// lifetime; the accessor pointer is captured as-is.
Callback HandleAnalyzeGraphQuery(AnalyzeGraphQuery *analyze_graph_query, DbAccessor *execution_db_accessor) {
  using Action = AnalyzeGraphQuery::Action;

  Callback callback;
  if (analyze_graph_query->action_ == Action::ANALYZE) {
    callback.header = {"label", "property", "num estimation nodes",
                       "num groups", "avg group size", "chi-squared value"};
    // `mutable` is required: the non-const captured vector is viewed through std::span<std::string>.
    callback.fn = [handler = AnalyzeGraphQueryHandler(), labels = analyze_graph_query->labels_,
                   execution_db_accessor]() mutable {
      return handler.AnalyzeGraphCreateStatistics(labels, execution_db_accessor);
    };
  } else if (analyze_graph_query->action_ == Action::DELETE) {
    callback.header = {"label", "property"};
    callback.fn = [handler = AnalyzeGraphQueryHandler(), labels = analyze_graph_query->labels_,
                   execution_db_accessor]() mutable {
      return handler.AnalyzeGraphDeleteStatistics(labels, execution_db_accessor);
    };
  }
  return callback;
}
// Prepares an ANALYZE GRAPH query for execution.
// Throws AnalyzeGraphInMulticommandTxException when called inside an explicit transaction.
// The statistics work itself is deferred: the callback runs on the first pull of the returned
// PreparedQuery, and COMMIT is reported once all rows have been pulled.
PreparedQuery PrepareAnalyzeGraphQuery(ParsedQuery parsed_query, bool in_explicit_transaction,
DbAccessor *execution_db_accessor, InterpreterContext *interpreter_context) {
if (in_explicit_transaction) {
throw AnalyzeGraphInMulticommandTxException();
}
// Index statistics feed index selection during planning, so cached plans are dropped.
// NOTE(review): the invalidation fires when this function returns, i.e. before the deferred
// callback has updated the statistics — confirm that ordering is intended.
auto invalidate_plan_cache = [plan_cache = &interpreter_context->plan_cache] {
auto access = plan_cache->access();
for (auto &kv : access) {
access.remove(kv.first);
}
};
utils::OnScopeExit cache_invalidator(invalidate_plan_cache);
auto *analyze_graph_query = utils::Downcast<AnalyzeGraphQuery>(parsed_query.query);
MG_ASSERT(analyze_graph_query);
auto callback = HandleAnalyzeGraphQuery(analyze_graph_query, execution_db_accessor);
return PreparedQuery{std::move(callback.header), std::move(parsed_query.required_privileges),
[callback_fn = std::move(callback.fn), pull_plan = std::shared_ptr<PullPlanVector>{nullptr}](
AnyStream *stream, std::optional<int> n) mutable -> std::optional<QueryHandlerResult> {
// Run the callback exactly once, on the first pull, and keep its rows for later pulls.
if (UNLIKELY(!pull_plan)) {
pull_plan = std::make_shared<PullPlanVector>(callback_fn());
}
if (pull_plan->Pull(stream, n)) {
return QueryHandlerResult::COMMIT;
}
return std::nullopt;
},
RWType::NONE};
}
PreparedQuery PrepareIndexQuery(ParsedQuery parsed_query, bool in_explicit_transaction,
std::vector<Notification> *notifications, InterpreterContext *interpreter_context) {
if (in_explicit_transaction) {
@ -2504,7 +2637,7 @@ Interpreter::PrepareResult Interpreter::Prepare(const std::string &query_string,
if (!in_explicit_transaction_ &&
(utils::Downcast<CypherQuery>(parsed_query.query) || utils::Downcast<ExplainQuery>(parsed_query.query) ||
utils::Downcast<ProfileQuery>(parsed_query.query) || utils::Downcast<DumpQuery>(parsed_query.query) ||
utils::Downcast<TriggerQuery>(parsed_query.query) ||
utils::Downcast<TriggerQuery>(parsed_query.query) || utils::Downcast<AnalyzeGraphQuery>(parsed_query.query) ||
utils::Downcast<TransactionQueueQuery>(parsed_query.query))) {
db_accessor_ =
std::make_unique<storage::Storage::Accessor>(interpreter_context_->db->Access(GetIsolationLevelOverride()));
@ -2537,6 +2670,9 @@ Interpreter::PrepareResult Interpreter::Prepare(const std::string &query_string,
} else if (utils::Downcast<IndexQuery>(parsed_query.query)) {
prepared_query = PrepareIndexQuery(std::move(parsed_query), in_explicit_transaction_,
&query_execution->notifications, interpreter_context_);
} else if (utils::Downcast<AnalyzeGraphQuery>(parsed_query.query)) {
prepared_query = PrepareAnalyzeGraphQuery(std::move(parsed_query), in_explicit_transaction_,
&*execution_db_accessor_, interpreter_context_);
} else if (utils::Downcast<AuthQuery>(parsed_query.query)) {
prepared_query = PrepareAuthQuery(
std::move(parsed_query), in_explicit_transaction_, &query_execution->summary, interpreter_context_,

View File

@ -172,6 +172,24 @@ class ReplicationQueryHandler {
virtual std::vector<Replica> ShowReplicas() const = 0;
};
/// Handler implementing the two ANALYZE GRAPH actions. It holds no data members; both operations
/// are static and work solely through the accessor passed in.
class AnalyzeGraphQueryHandler {
 public:
  AnalyzeGraphQueryHandler() = default;
  AnalyzeGraphQueryHandler(const AnalyzeGraphQueryHandler &) = default;
  AnalyzeGraphQueryHandler(AnalyzeGraphQueryHandler &&) = default;
  AnalyzeGraphQueryHandler &operator=(const AnalyzeGraphQueryHandler &) = default;
  AnalyzeGraphQueryHandler &operator=(AnalyzeGraphQueryHandler &&) = default;
  virtual ~AnalyzeGraphQueryHandler() = default;

  /// Computes and stores statistics for the label-property indices of `labels`
  /// and returns one result row per analyzed index.
  static std::vector<std::vector<TypedValue>> AnalyzeGraphCreateStatistics(const std::span<std::string> labels,
                                                                           DbAccessor *execution_db_accessor);

  /// Deletes stored statistics for `labels` and returns one {label, property} row per deleted entry.
  static std::vector<std::vector<TypedValue>> AnalyzeGraphDeleteStatistics(const std::span<std::string> labels,
                                                                           DbAccessor *execution_db_accessor);
};
/**
* A container for data related to the preparation of a query.
*/

View File

@ -27,6 +27,7 @@
#include "query/plan/operator.hpp"
#include "query/plan/preprocess.hpp"
#include "storage/v2/indices.hpp"
DECLARE_int64(query_vertex_count_to_expand_existing);
@ -482,6 +483,7 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor {
// FilterInfo with PropertyFilter.
FilterInfo filter;
int64_t vertex_count;
std::optional<storage::IndexStats> index_stats;
};
bool DefaultPreVisit() override { throw utils::NotYetImplemented("optimizing index lookup"); }
@ -522,8 +524,11 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor {
return best_label;
}
// Finds the label-property combination which has indexed the lowest amount of
// vertices. If the index cannot be found, nullopt is returned.
// Finds the best label-property index for the given symbol. The first criterion is the number of indexed
// vertices: if one index has 10x fewer vertices than the other, the smaller one is always chosen. Otherwise, the
// index with the smallest average group size (based on key distribution) is chosen. If the average group sizes are
// equal, the index whose distribution is closer to uniform is chosen. Average group size and key distribution can
// only be taken into account if the user has run an `ANALYZE GRAPH` query beforehand. If no index can be found,
// nullopt is returned.
std::optional<LabelPropertyIndex> FindBestLabelPropertyIndex(const Symbol &symbol,
const std::unordered_set<Symbol> &bound_symbols) {
auto are_bound = [&bound_symbols](const auto &used_symbols) {
@ -534,6 +539,27 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor {
}
return true;
};
/*
* Comparator function between two indices. If new index has >= 10x vertices than the existing, it cannot be better.
* If it is <= 10x in number of vertices, check average group size of property values. The index with smaller
* average group size is better. If the average group size is the same, choose the one closer to the uniform
* distribution
* @param found: Current best label-property index.
* @param new_stats: Label-property index candidate.
* @param vertex_count: New index's number of vertices.
* @return -1 if the new index is better, 0 if they are equal and 1 if the existing one is better.
*/
auto compare_indices = [](std::optional<LabelPropertyIndex> &found, std::optional<storage::IndexStats> &new_stats,
int vertex_count) {
if (!new_stats.has_value() || vertex_count / 10.0 > found->vertex_count) {
return 1;
}
int cmp_avg_group = utils::CompareDecimal(new_stats->avg_group_size, found->index_stats->avg_group_size);
if (cmp_avg_group != 0) return cmp_avg_group;
return utils::CompareDecimal(new_stats->statistic, found->index_stats->statistic);
};
std::optional<LabelPropertyIndex> found;
for (const auto &label : filters_.FilteredLabels(symbol)) {
for (const auto &filter : filters_.PropertyFilters(symbol)) {
@ -548,7 +574,6 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor {
if (!db_->LabelPropertyIndexExists(GetLabel(label), GetProperty(property))) {
continue;
}
int64_t vertex_count = db_->VerticesCount(GetLabel(label), GetProperty(property));
auto is_better_type = [&found](PropertyFilter::Type type) {
// Order the types by the most preferred index lookup type.
static const PropertyFilter::Type kFilterTypeOrder[] = {
@ -557,17 +582,32 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor {
auto *type_sort_ix = std::find(kFilterTypeOrder, kFilterTypeOrder + 3, type);
return type_sort_ix < found_sort_ix;
};
if (!found || vertex_count < found->vertex_count ||
(vertex_count == found->vertex_count && is_better_type(filter.property_filter->type_))) {
found = LabelPropertyIndex{label, filter, vertex_count};
int64_t vertex_count = db_->VerticesCount(GetLabel(label), GetProperty(property));
std::optional<storage::IndexStats> new_stats = db_->GetIndexStats(GetLabel(label), GetProperty(property));
// Conditions, from more to less important:
// the index with 10x less vertices is better.
// the index with smaller average group size is better.
// the index with equal avg group size and distribution closer to the uniform is better.
// the index with less vertices is better.
// the index with same number of vertices but more optimized filter is better.
if (!found || vertex_count * 10 < found->vertex_count) {
found = LabelPropertyIndex{label, filter, vertex_count, new_stats};
continue;
}
if (int cmp_res = compare_indices(found, new_stats, vertex_count);
cmp_res == -1 ||
cmp_res == 0 && (found->vertex_count > vertex_count ||
found->vertex_count == vertex_count && is_better_type(filter.property_filter->type_))) {
found = LabelPropertyIndex{label, filter, vertex_count, new_stats};
}
}
}
return found;
}
// Creates a ScanAll by the best possible index for the `node_symbol`. Best
// index is defined as the index with least number of vertices. If the node
// Creates a ScanAll by the best possible index for the `node_symbol`. If the node
// does not have at least a label, no indexed lookup can be created and
// `nullptr` is returned. The operator is chained after `input`. Optional
// `max_vertex_count` controls, whether no operator should be created if the

View File

@ -1,4 +1,4 @@
// Copyright 2022 Memgraph Ltd.
// Copyright 2023 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@ -78,6 +78,11 @@ class VertexCountCache {
return db_->LabelPropertyIndexExists(label, property);
}
/// Returns the statistics for the (label, property) index by forwarding to `db_->GetIndexStats`.
std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
return db_->GetIndexStats(label, property);
}
private:
typedef std::pair<storage::LabelId, storage::PropertyId> LabelPropertyKey;

View File

@ -1,4 +1,4 @@
// Copyright 2022 Memgraph Ltd.
// Copyright 2023 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@ -10,6 +10,8 @@
// licenses/APL.txt.
#include "indices.hpp"
#include <algorithm>
#include <iterator>
#include <limits>
#include "storage/v2/mvcc.hpp"
@ -688,6 +690,45 @@ int64_t LabelPropertyIndex::ApproximateVertexCount(LabelId label, PropertyId pro
return acc.estimate_range_count(lower, upper, utils::SkipListLayerForCountEstimation(acc.size()));
}
/// Removes the stored statistics of every label-property index whose label equals `label`.
/// @return the (label, property) keys whose statistics were removed.
std::vector<std::pair<LabelId, PropertyId>> LabelPropertyIndex::DeleteIndexStatsForLabel(
    const storage::LabelId &label) {
  std::vector<std::pair<LabelId, PropertyId>> removed_keys;
  auto entry = stats_.cbegin();
  while (entry != stats_.cend()) {
    if (!(entry->first.first == label)) {
      ++entry;
      continue;
    }
    removed_keys.push_back(entry->first);
    // erase returns the iterator following the removed element, keeping the walk valid.
    entry = stats_.erase(entry);
  }
  return removed_keys;
}
std::vector<std::pair<LabelId, PropertyId>> LabelPropertyIndex::ClearIndexStats() {
std::vector<std::pair<LabelId, PropertyId>> deleted_indexes;
deleted_indexes.reserve(stats_.size());
std::transform(stats_.begin(), stats_.end(), std::back_inserter(deleted_indexes),
[](const auto &elem) { return elem.first; });
stats_.clear();
return deleted_indexes;
}
/// Stores `stats` for the (label, property) index, replacing any previously stored value.
void LabelPropertyIndex::SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property,
                                       const IndexStats &stats) {
  const std::pair<LabelId, PropertyId> key{label, property};
  stats_[key] = stats;
}
/// Looks up the statistics stored for the (label, property) index.
/// @return the stored statistics, or std::nullopt when none are recorded for that index.
std::optional<IndexStats> LabelPropertyIndex::GetIndexStats(const storage::LabelId &label,
                                                            const storage::PropertyId &property) const {
  const auto entry = stats_.find({label, property});
  if (entry == stats_.end()) {
    return std::nullopt;
  }
  return entry->second;
}
void LabelPropertyIndex::RunGC() {
for (auto &index_entry : index_) {
index_entry.second.run_gc();

View File

@ -1,4 +1,4 @@
// Copyright 2022 Memgraph Ltd.
// Copyright 2023 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@ -131,6 +131,10 @@ class LabelIndex {
Config::Items config_;
};
/// Statistics describing the property-value distribution of one label-property index.
/// Member order matters: callers build this with designated initializers.
struct IndexStats {
  /// Chi-squared statistic of the per-value vertex counts against the average group size.
  double statistic;
  /// Average number of indexed vertices per distinct property value.
  double avg_group_size;
};
class LabelPropertyIndex {
private:
struct Entry {
@ -237,12 +241,23 @@ class LabelPropertyIndex {
const std::optional<utils::Bound<PropertyValue>> &lower,
const std::optional<utils::Bound<PropertyValue>> &upper) const;
std::vector<std::pair<LabelId, PropertyId>> ClearIndexStats();
std::vector<std::pair<LabelId, PropertyId>> DeleteIndexStatsForLabel(const storage::LabelId &label);
void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property,
const storage::IndexStats &stats);
std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const;
void Clear() { index_.clear(); }
void RunGC();
private:
std::map<std::pair<LabelId, PropertyId>, utils::SkipList<Entry>> index_;
std::map<std::pair<LabelId, PropertyId>, storage::IndexStats> stats_;
Indices *indices_;
Constraints *constraints_;
Config::Items config_;

View File

@ -1,4 +1,4 @@
// Copyright 2022 Memgraph Ltd.
// Copyright 2023 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source

View File

@ -16,6 +16,7 @@
#include <filesystem>
#include <optional>
#include <shared_mutex>
#include <span>
#include <variant>
#include "io/network/endpoint.hpp"
@ -265,6 +266,30 @@ class Storage final {
return storage_->indices_.label_property_index.ApproximateVertexCount(label, property, lower, upper);
}
/// Returns the statistics stored in the label-property index for (label, property), if any.
std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
return storage_->indices_.label_property_index.GetIndexStats(label, property);
}
/// Forwards to the label-property index's ClearIndexStats and hands back the (label, property) pairs it returns.
std::vector<std::pair<LabelId, PropertyId>> ClearIndexStats() {
  auto &label_property_index = storage_->indices_.label_property_index;
  return label_property_index.ClearIndexStats();
}
std::vector<std::pair<LabelId, PropertyId>> DeleteIndexStatsForLabels(const std::span<std::string> labels) {
std::vector<std::pair<LabelId, PropertyId>> deleted_indexes;
std::for_each(labels.begin(), labels.end(), [this, &deleted_indexes](const auto &label_str) {
std::vector<std::pair<LabelId, PropertyId>> loc_results =
storage_->indices_.label_property_index.DeleteIndexStatsForLabel(NameToLabel(label_str));
deleted_indexes.insert(deleted_indexes.end(), std::make_move_iterator(loc_results.begin()),
std::make_move_iterator(loc_results.end()));
});
return deleted_indexes;
}
/// Passes `stats` for the (label, property) index on to the label-property index.
void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property, const IndexStats &stats) {
  auto &label_property_index = storage_->indices_.label_property_index;
  label_property_index.SetIndexStats(label, property, stats);
}
/// @return Accessor to the deleted vertex if a deletion took place, std::nullopt otherwise
/// @throw std::bad_alloc
Result<std::optional<VertexAccessor>> DeleteVertex(VertexAccessor *vertex);

View File

@ -1,4 +1,4 @@
// Copyright 2022 Memgraph Ltd.
// Copyright 2023 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@ -11,11 +11,15 @@
#pragma once
#include <boost/math/special_functions/math_fwd.hpp>
#include <cmath>
#include <cstdint>
#include <limits>
#include <optional>
#include <type_traits>
#include <boost/math/special_functions/relative_difference.hpp>
namespace memgraph::utils {
static_assert(std::is_same_v<uint64_t, unsigned long>,
@ -64,4 +68,36 @@ constexpr std::optional<uint64_t> RoundUint64ToMultiple(uint64_t val, uint64_t m
return (numerator / multiple) * multiple;
}
// Satisfied exactly by the types for which std::is_floating_point_v<T> is true; used below to constrain the
// decimal comparison helpers.
template <typename T>
concept FloatingPoint = std::is_floating_point_v<T>;
// Returns true when the relative difference between `a` and `b` (as computed by
// boost::math::relative_difference) is below the machine epsilon of T.
template <FloatingPoint T>
bool ApproxEqualDecimal(T a, T b) {
  const auto relative_gap = boost::math::relative_difference(a, b);
  return relative_gap < std::numeric_limits<T>::epsilon();
}
// Returns true when `b` exceeds `a` by more than the machine epsilon of T. The margin is absolute, not
// relative to the magnitude of the operands.
template <FloatingPoint T>
bool LessThanDecimal(T a, T b) {
  const T gap = b - a;
  return gap > std::numeric_limits<T>::epsilon();
}
/*
 * Three-way comparison of two floating-point values.
 * return 0 if a == b (approximately, as decided by ApproxEqualDecimal)
 * return 1 if a > b
 * return -1 if a < b
 */
template <FloatingPoint T>
int CompareDecimal(T a, T b) {
  if (ApproxEqualDecimal(a, b)) return 0;
  // The operands are already known to differ, so order them with the exact comparison. Ordering through
  // LessThanDecimal would demand an absolute gap larger than epsilon, which small-magnitude operands
  // (e.g. 1e-20 vs 2e-20) never reach; a < b would then be reported as a > b.
  return a < b ? -1 : 1;
}
// Single chi-squared term: (observed - expected)^2 / expected.
// When `expected` is approximately zero the division is skipped and the largest finite double is returned.
// NOTE(review): declared constexpr, but ApproxEqualDecimal is not constexpr, so this probably cannot be
// evaluated in a constant expression — confirm whether constexpr is intended.
constexpr double ChiSquaredValue(double observed, double expected) {
  if (!utils::ApproxEqualDecimal(expected, 0.0)) {
    const double deviation = observed - expected;
    return deviation * deviation / expected;
  }
  return std::numeric_limits<double>::max();
}
} // namespace memgraph::utils

View File

@ -176,9 +176,9 @@ enum class TypeId : uint64_t {
AST_VERSION_QUERY,
AST_FOREACH,
AST_SHOW_CONFIG_QUERY,
AST_ANALYZE_GRAPH_QUERY,
AST_TRANSACTION_QUEUE_QUERY,
AST_EXISTS,
// Symbol
SYMBOL,
};

View File

@ -44,6 +44,7 @@ add_subdirectory(module_file_manager)
add_subdirectory(monitoring_server)
add_subdirectory(lba_procedures)
add_subdirectory(python_query_modules_reloading)
add_subdirectory(analyze_graph)
add_subdirectory(transaction_queue)
add_subdirectory(mock_api)

View File

@ -0,0 +1,6 @@
# Thin wrapper that calls copy_e2e_python_files (defined outside this file) with the `analyze_graph`
# name fixed, so callers only pass the file name.
function(copy_analyze_graph_e2e_python_files FILE_NAME)
copy_e2e_python_files(analyze_graph ${FILE_NAME})
endfunction()
# Python files registered for the analyze_graph e2e tests.
copy_analyze_graph_e2e_python_files(common.py)
copy_analyze_graph_e2e_python_files(optimize_indexes.py)

View File

@ -0,0 +1,29 @@
# Copyright 2023 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.
import typing
import mgclient
import pytest
def execute_and_fetch_all(
    cursor: mgclient.Cursor, query: str, params: typing.Optional[dict] = None
) -> typing.List[tuple]:
    """Execute ``query`` on ``cursor`` and return all fetched rows.

    ``params`` are the query parameters handed to ``cursor.execute``; when omitted
    (``None``) an empty dict is passed.
    """
    # Build a fresh dict per call rather than sharing one mutable default across calls.
    cursor.execute(query, {} if params is None else params)
    return cursor.fetchall()
@pytest.fixture
def connect(**kwargs) -> mgclient.Connection:
    """Yield an autocommit connection to localhost:7687 and delete all vertices once the test is done."""
    conn = mgclient.connect(host="localhost", port=7687, **kwargs)
    conn.autocommit = True
    yield conn
    # Teardown: runs after the test body, on a fresh cursor of the same connection.
    execute_and_fetch_all(conn.cursor(), "MATCH (n) DETACH DELETE n")

View File

@ -0,0 +1,282 @@
# Copyright 2023 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.
import sys
import pytest
from common import connect, execute_and_fetch_all
# E2E tests for checking query semantic
# ------------------------------------
@pytest.mark.parametrize(
    "delete_query",
    [
        "ANALYZE GRAPH DELETE STATISTICS",
        "ANALYZE GRAPH ON LABELS * DELETE STATISTICS",
        "ANALYZE GRAPH ON LABELS :Label DELETE STATISTICS",
        "ANALYZE GRAPH ON LABELS :Label, :NONEXISTING DELETE STATISTICS",
    ],
)
def test_analyze_graph_delete_statistics(delete_query, connect):
    """Checks that every delete-statistics query variant reports both indexes and resets the index choice."""
    cursor = connect.cursor()
    for setup_query in (
        "FOREACH (i IN range(1, 100) | CREATE (n:Label {id1: i}));",
        "FOREACH (i IN range(1, 50) | CREATE (n:Label {id2: i % 5}));",
        "CREATE INDEX ON :Label(id1);",
        "CREATE INDEX ON :Label(id2);",
    ):
        execute_and_fetch_all(cursor, setup_query)

    stats_rows = execute_and_fetch_all(cursor, "ANALYZE GRAPH")
    assert len(stats_rows) == 2

    deleted_rows = execute_and_fetch_all(cursor, delete_query)
    assert len(deleted_rows) == 2
    # The test does not depend on row order: find the id1 row, the other one must be id2.
    id1_pos = 0 if deleted_rows[0][1] == "id1" else 1
    assert deleted_rows[id1_pos] == ("Label", "id1")
    assert deleted_rows[1 - id1_pos] == ("Label", "id2")

    # With the statistics deleted, id2 should be chosen because its index has fewer vertices.
    plan_rows = execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
    assert plan_rows == [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",),
        (f" * Once",),
    ]

    for drop_query in ("DROP INDEX ON :Label(id1);", "DROP INDEX ON :Label(id2);"):
        execute_and_fetch_all(cursor, drop_query)
@pytest.mark.parametrize(
    "analyze_query",
    [
        "ANALYZE GRAPH",
        "ANALYZE GRAPH ON LABELS *",
        "ANALYZE GRAPH ON LABELS :Label",
        "ANALYZE GRAPH ON LABELS :Label, :NONEXISTING",
    ],
)
def test_analyze_full_graph(analyze_query, connect):
    """Checks that analyzing the graph switches the planner to the index with the smaller average group size.

    Runs for the whole-graph and per-label query forms, including one that names a non-existing label.
    """
    cursor = connect.cursor()
    for setup_query in (
        "FOREACH (i IN range(1, 100) | CREATE (n:Label {id1: i}));",
        "FOREACH (i IN range(1, 50) | CREATE (n:Label {id2: i % 5}));",
        "CREATE INDEX ON :Label(id1);",
        "CREATE INDEX ON :Label(id2);",
    ):
        execute_and_fetch_all(cursor, setup_query)

    # Before the analysis id2 is chosen because its index has fewer vertices.
    plan_before = execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
    assert plan_before == [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",),
        (f" * Once",),
    ]

    # Run the analyze query and check the reported rows regardless of their order.
    stats_rows = execute_and_fetch_all(cursor, analyze_query)
    assert len(stats_rows) == 2
    id1_pos = 0 if stats_rows[0][1] == "id1" else 1
    assert stats_rows[id1_pos] == ("Label", "id1", 100, 100, 1, 0)
    assert stats_rows[1 - id1_pos] == ("Label", "id2", 50, 5, 10, 0)

    # After the analysis id1 should be chosen because it has the smaller average group size.
    plan_after = execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
    assert plan_after == [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id1}})",),
        (f" * Once",),
    ]

    assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2
    for drop_query in ("DROP INDEX ON :Label(id1);", "DROP INDEX ON :Label(id2);"):
        execute_and_fetch_all(cursor, drop_query)
# Explicit index choosing tests
# -----------------------------
def test_cardinality_different_avg_group_size_uniform_dist(connect):
    """Both indices are uniformly distributed; the one with the smaller avg. group size (id1) must be chosen."""
    cursor = connect.cursor()
    for setup_query in (
        "FOREACH (i IN range(1, 100) | CREATE (n:Label {id1: i}));",
        "FOREACH (i IN range(1, 100) | CREATE (n:Label {id2: i % 20}));",
        "CREATE INDEX ON :Label(id1);",
        "CREATE INDEX ON :Label(id2);",
    ):
        execute_and_fetch_all(cursor, setup_query)

    stats_rows = execute_and_fetch_all(cursor, "ANALYZE GRAPH")
    # Locate the id1 row; the remaining row is expected to be id2.
    id1_pos = 0 if stats_rows[0][1] == "id1" else 1
    assert stats_rows[id1_pos] == ("Label", "id1", 100, 100, 1, 0)
    assert stats_rows[1 - id1_pos] == ("Label", "id2", 100, 20, 5, 0)

    plan_rows = execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
    assert plan_rows == [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id1}})",),
        (f" * Once",),
    ]

    assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2
    for drop_query in ("DROP INDEX ON :Label(id1);", "DROP INDEX ON :Label(id2);"):
        execute_and_fetch_all(cursor, drop_query)
def test_cardinality_same_avg_group_size_uniform_dist_diff_vertex_count(connect):
    """Both indices are uniform with the same avg. group size; the one with fewer vertices (id2) must be chosen."""
    cursor = connect.cursor()
    for setup_query in (
        "FOREACH (i IN range(1, 100) | CREATE (n:Label {id1: i}));",
        "FOREACH (i IN range(1, 50) | CREATE (n:Label {id2: i}));",
        "CREATE INDEX ON :Label(id1);",
        "CREATE INDEX ON :Label(id2);",
    ):
        execute_and_fetch_all(cursor, setup_query)

    stats_rows = execute_and_fetch_all(cursor, "ANALYZE GRAPH")
    # Locate the id1 row; the remaining row is expected to be id2.
    id1_pos = 0 if stats_rows[0][1] == "id1" else 1
    assert stats_rows[id1_pos] == ("Label", "id1", 100, 100, 1, 0)
    assert stats_rows[1 - id1_pos] == ("Label", "id2", 50, 50, 1, 0)

    plan_rows = execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
    assert plan_rows == [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",),
        (f" * Once",),
    ]

    assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2
    for drop_query in ("DROP INDEX ON :Label(id1);", "DROP INDEX ON :Label(id2);"):
        execute_and_fetch_all(cursor, drop_query)
def test_large_diff_in_num_vertices_v1(connect):
    """When the vertex counts of two indices differ by more than 10x, the smaller index (here id2) must be
    chosen no matter the avg. group size and key distribution."""
    cursor = connect.cursor()
    for setup_query in (
        "FOREACH (i IN range(1, 1000) | CREATE (n:Label {id1: i}));",
        "FOREACH (i IN range(1, 99) | CREATE (n:Label {id2: 1}));",
        "CREATE INDEX ON :Label(id1);",
        "CREATE INDEX ON :Label(id2);",
    ):
        execute_and_fetch_all(cursor, setup_query)

    stats_rows = execute_and_fetch_all(cursor, "ANALYZE GRAPH")
    # Locate the id1 row; the remaining row is expected to be id2.
    id1_pos = 0 if stats_rows[0][1] == "id1" else 1
    assert stats_rows[id1_pos] == ("Label", "id1", 1000, 1000, 1, 0)
    assert stats_rows[1 - id1_pos] == ("Label", "id2", 99, 1, 99, 0)

    plan_rows = execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
    assert plan_rows == [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",),
        (f" * Once",),
    ]

    assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2
    for drop_query in ("DROP INDEX ON :Label(id1);", "DROP INDEX ON :Label(id2);"):
        execute_and_fetch_all(cursor, drop_query)
def test_large_diff_in_num_vertices_v2(connect):
    """Mirror of the v1 case with the properties swapped: the vertex counts differ by more than 10x and the
    smaller index (here id1) must be chosen no matter the avg. group size and key distribution."""
    cursor = connect.cursor()
    for setup_query in (
        "FOREACH (i IN range(1, 99) | CREATE (n:Label {id1: 1}));",
        "FOREACH (i IN range(1, 1000) | CREATE (n:Label {id2: i}));",
        "CREATE INDEX ON :Label(id1);",
        "CREATE INDEX ON :Label(id2);",
    ):
        execute_and_fetch_all(cursor, setup_query)

    stats_rows = execute_and_fetch_all(cursor, "ANALYZE GRAPH")
    # Locate the id1 row; the remaining row is expected to be id2.
    id1_pos = 0 if stats_rows[0][1] == "id1" else 1
    assert stats_rows[id1_pos] == ("Label", "id1", 99, 1, 99, 0)
    assert stats_rows[1 - id1_pos] == ("Label", "id2", 1000, 1000, 1, 0)

    plan_rows = execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
    assert plan_rows == [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id1}})",),
        (f" * Once",),
    ]

    assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2
    for drop_query in ("DROP INDEX ON :Label(id1);", "DROP INDEX ON :Label(id2);"):
        execute_and_fetch_all(cursor, drop_query)
def test_same_avg_group_size_diff_distribution(connect):
    """Same vertex count and avg. group size for both indices; the uniformly distributed one (id2) must win."""
    cursor = connect.cursor()
    for setup_query in (
        # First key distribution: uneven group sizes for id1.
        "FOREACH (i IN range(1, 10) | CREATE (n:Label {id1: 1}));",
        "FOREACH (i IN range(1, 30) | CREATE (n:Label {id1: 2}));",
        "FOREACH (i IN range(1, 20) | CREATE (n:Label {id1: 3}));",
        "FOREACH (i IN range(1, 35) | CREATE (n:Label {id1: 4}));",
        "FOREACH (i IN range(1, 5) | CREATE (n:Label {id1: 5}));",
        # Second key distribution: five equally sized groups for id2.
        "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 1}));",
        "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 2}));",
        "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 3}));",
        "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 4}));",
        "FOREACH (i IN range(1, 20) | CREATE (n:Label {id2: 5}));",
        "CREATE INDEX ON :Label(id1);",
        "CREATE INDEX ON :Label(id2);",
    ):
        execute_and_fetch_all(cursor, setup_query)

    stats_rows = execute_and_fetch_all(cursor, "ANALYZE GRAPH")
    # Locate the id1 row; the remaining row is expected to be id2.
    id1_pos = 0 if stats_rows[0][1] == "id1" else 1
    assert stats_rows[id1_pos] == ("Label", "id1", 100, 5, 20, 32.5)
    assert stats_rows[1 - id1_pos] == ("Label", "id2", 100, 5, 20, 0)

    plan_rows = execute_and_fetch_all(cursor, "EXPLAIN MATCH (n:Label) WHERE n.id2 = 3 AND n.id1 = 3 RETURN n;")
    assert plan_rows == [
        (f" * Produce {{n}}",),
        (f" * Filter",),
        (f" * ScanAllByLabelPropertyValue (n :Label {{id2}})",),
        (f" * Once",),
    ]

    assert len(execute_and_fetch_all(cursor, "ANALYZE GRAPH DELETE STATISTICS")) == 2
    for drop_query in ("DROP INDEX ON :Label(id1);", "DROP INDEX ON :Label(id2);"):
        execute_and_fetch_all(cursor, drop_query)
if __name__ == "__main__":
    # Run this module through pytest and use its return value as the process exit status.
    exit_code = pytest.main([__file__, "-rA"])
    sys.exit(exit_code)

View File

@ -0,0 +1,14 @@
analyze_graph_cluster: &analyze_graph_cluster
cluster:
main:
args: ["--bolt-port", "7687", "--log-level=TRACE"]
log_file: "analyze_graph.log"
setup_queries: []
validation_queries: []
workloads:
- name: "Analyze graph for better indexing"
binary: "tests/e2e/pytest_runner.sh"
args: ["analyze_graph/optimize_indexes.py"]
<<: *analyze_graph_cluster

View File

@ -1,4 +1,4 @@
// Copyright 2022 Memgraph Ltd.
// Copyright 2023 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@ -213,6 +213,11 @@ class InteractiveDbAccessor {
return label_property_index_.at(key);
}
// Forwards the index-statistics lookup for (label, property) to the wrapped accessor `dba_`.
std::optional<memgraph::storage::IndexStats> GetIndexStats(memgraph::storage::LabelId label,
                                                           memgraph::storage::PropertyId property) const {
  std::optional<memgraph::storage::IndexStats> stats = dba_->GetIndexStats(label, property);
  return stats;
}
// Save the cached vertex counts to a stream.
void Save(std::ostream &out) {
out << "vertex-count " << vertices_count_ << std::endl;

View File

@ -11,6 +11,7 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <climits>
#include "query/frontend/semantic/symbol_generator.hpp"
#include "query/frontend/semantic/symbol_table.hpp"
@ -459,6 +460,11 @@ class FakeDbAccessor {
return false;
}
// Returns the same fixed statistics for every index, ignoring both arguments:
// statistic 0 and average group size 1 (the "unique id" case).
memgraph::storage::IndexStats GetIndexStats(memgraph::storage::LabelId label,
                                            memgraph::storage::PropertyId property) const {
  memgraph::storage::IndexStats stats{};
  stats.statistic = 0;
  stats.avg_group_size = 1;  // unique id
  return stats;
}
// Stores `count` under `label` in label_index_, overwriting any value already stored for that label.
void SetIndexCount(memgraph::storage::LabelId label, int64_t count) { label_index_[label] = count; }
void SetIndexCount(memgraph::storage::LabelId label, memgraph::storage::PropertyId property, int64_t count) {

View File

@ -1,4 +1,4 @@
// Copyright 2022 Memgraph Ltd.
// Copyright 2023 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@ -10,6 +10,7 @@
// licenses/APL.txt.
#include <cmath>
#include <limits>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
@ -21,3 +22,29 @@ TEST(UtilsMath, Log2) {
ASSERT_EQ(memgraph::utils::Log2(i), static_cast<uint64_t>(log2(i)));
}
}
// float operands: literals that differ by far less than float precision compare equal, while a
// difference in the fourth decimal place does not.
TEST(UtilsMath, EqualFloat) {
  using memgraph::utils::ApproxEqualDecimal;
  ASSERT_TRUE(ApproxEqualDecimal(0.2f, 0.2f));
  ASSERT_TRUE(ApproxEqualDecimal(0.2f, 0.199999999999f));
  ASSERT_TRUE(ApproxEqualDecimal(0.2f, 0.200000000001f));
  ASSERT_FALSE(ApproxEqualDecimal(0.2f, 0.19995f));
}
// double operands: literals that differ by far less than double precision compare equal, while a
// difference in the fourth decimal place does not.
TEST(UtilsMath, EqualDouble) {
  using memgraph::utils::ApproxEqualDecimal;
  ASSERT_TRUE(ApproxEqualDecimal(0.2, 0.2));
  ASSERT_TRUE(ApproxEqualDecimal(0.2, 0.19999999999999999999));
  ASSERT_TRUE(ApproxEqualDecimal(0.2, 0.20000000000000000001));
  ASSERT_FALSE(ApproxEqualDecimal(0.2, 0.19995));
}
// The first argument is smaller than the second by a clearly visible margin in both cases.
TEST(UtilsMath, LessThan) {
  using memgraph::utils::LessThanDecimal;
  ASSERT_TRUE(LessThanDecimal(0.2, 0.3));
  ASSERT_TRUE(LessThanDecimal(0.2, 0.20001));
}
// Covers the zero-expected guard and three ordinary (observed - expected)^2 / expected values.
TEST(UtilsMath, ChiSquared) {
  using memgraph::utils::ChiSquaredValue;
  // expected == 0: the largest double is returned instead of dividing by zero.
  ASSERT_EQ(std::numeric_limits<double>::max(), ChiSquaredValue(2.0, 0.0));
  ASSERT_DOUBLE_EQ(0.0, ChiSquaredValue(2.0, 2.0));
  ASSERT_DOUBLE_EQ(1.0, ChiSquaredValue(2.0, 1.0));
  ASSERT_DOUBLE_EQ(1. / 3., ChiSquaredValue(4.0, 3.0));
}