diff --git a/src/query/constants.hpp b/src/query/constants.hpp index 55b1eebea..ce70bab41 100644 --- a/src/query/constants.hpp +++ b/src/query/constants.hpp @@ -15,5 +15,5 @@ namespace memgraph::query { inline const std::string kAsterisk = "*"; -inline constexpr uint16_t kDeleteStatisticsNumResults = 6; +inline constexpr uint16_t kComputeStatisticsNumResults = 7; } // namespace memgraph::query diff --git a/src/query/db_accessor.hpp b/src/query/db_accessor.hpp index 564f4b2c4..b80fb0fe3 100644 --- a/src/query/db_accessor.hpp +++ b/src/query/db_accessor.hpp @@ -430,22 +430,36 @@ class DbAccessor final { return accessor_->LabelPropertyIndexExists(label, prop); } - std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label, - const storage::PropertyId &property) const { + std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const { + return accessor_->GetIndexStats(label); + } + + std::optional<storage::LabelPropertyIndexStats> GetIndexStats(const storage::LabelId &label, + const storage::PropertyId &property) const { return accessor_->GetIndexStats(label, property); } - std::vector<std::pair<storage::LabelId, storage::PropertyId>> ClearIndexStats() { - return accessor_->ClearIndexStats(); + std::vector<std::pair<storage::LabelId, storage::PropertyId>> ClearLabelPropertyIndexStats() { + return accessor_->ClearLabelPropertyIndexStats(); } - std::vector<std::pair<storage::LabelId, storage::PropertyId>> DeleteIndexStatsForLabels( + std::vector<storage::LabelId> ClearLabelIndexStats() { return accessor_->ClearLabelIndexStats(); } + + std::vector<std::pair<storage::LabelId, storage::PropertyId>> DeleteLabelPropertyIndexStats( const std::span<std::string> labels) { - return accessor_->DeleteIndexStatsForLabels(labels); + return accessor_->DeleteLabelPropertyIndexStats(labels); + } + + std::vector<storage::LabelId> DeleteLabelIndexStats(const std::span<std::string> labels) { + return accessor_->DeleteLabelIndexStats(labels); + } + + void SetIndexStats(const storage::LabelId &label, const storage::LabelIndexStats &stats) { + accessor_->SetIndexStats(label, stats); } void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property, - const storage::IndexStats &stats) { + const storage::LabelPropertyIndexStats &stats) { accessor_->SetIndexStats(label, property, stats); } diff --git a/src/query/frontend/semantic/symbol_generator.cpp b/src/query/frontend/semantic/symbol_generator.cpp index c92db6f2c..cde169675 100644 --- a/src/query/frontend/semantic/symbol_generator.cpp +++ b/src/query/frontend/semantic/symbol_generator.cpp @@ -157,25 +157,34 @@ void SymbolGenerator::VisitReturnBody(ReturnBody &body, Where *where) { // Query bool SymbolGenerator::PreVisit(SingleQuery &) { - prev_return_names_ = curr_return_names_; - curr_return_names_.clear(); + auto &scope = scopes_.back(); + + scope.prev_return_names = scope.curr_return_names; + scope.curr_return_names.clear(); return true; } // Union bool SymbolGenerator::PreVisit(CypherUnion &) { - scopes_.back() = Scope(); + auto next_scope = Scope(); + next_scope.curr_return_names = scopes_.back().curr_return_names; + + scopes_.pop_back(); + scopes_.push_back(next_scope); + return true; } bool SymbolGenerator::PostVisit(CypherUnion &cypher_union) { - if (prev_return_names_ != curr_return_names_) { + auto &scope = scopes_.back(); + + if (scope.prev_return_names != scope.curr_return_names) { throw SemanticException("All subqueries in an UNION must have the same column names."); } // create new symbols for the result of the union - for (const auto &name : curr_return_names_) { + for (const auto &name : scope.curr_return_names) { auto symbol = CreateSymbol(name, false); cypher_union.union_symbols_.push_back(symbol); } @@ -259,7 +268,9 @@ bool SymbolGenerator::PreVisit(Return &ret) { } bool SymbolGenerator::PostVisit(Return &) { - for (const auto &name_symbol : scopes_.back().symbols) curr_return_names_.insert(name_symbol.first); + auto &scope = scopes_.back(); + + for (const auto &name_symbol : scope.symbols) scope.curr_return_names.insert(name_symbol.first); return true; } diff --git a/src/query/frontend/semantic/symbol_generator.hpp b/src/query/frontend/semantic/symbol_generator.hpp index 25b8dc648..5b98958c6 100644 --- a/src/query/frontend/semantic/symbol_generator.hpp +++ b/src/query/frontend/semantic/symbol_generator.hpp @@ -140,6 +140,8 @@ class SymbolGenerator : public HierarchicalTreeVisitor { std::vector<Identifier *> identifiers_in_match; // Number of nested IfOperators. int num_if_operators{0}; + std::unordered_set<std::string> prev_return_names{}; + std::unordered_set<std::string> curr_return_names{}; }; static std::optional<Symbol> FindSymbolInScope(const std::string &name, const Scope &scope, Symbol::Type type); @@ -171,8 +173,6 @@ class SymbolGenerator : public HierarchicalTreeVisitor { // is mapped by its name. std::unordered_map<std::string, Identifier *> predefined_identifiers_; std::vector<Scope> scopes_; - std::unordered_set<std::string> prev_return_names_; - std::unordered_set<std::string> curr_return_names_; }; inline SymbolTable MakeSymbolTable(CypherQuery *query, const std::vector<Identifier *> &predefined_identifiers = {}) { diff --git a/src/query/interpreter.cpp b/src/query/interpreter.cpp index a31243385..087fdb09a 100644 --- a/src/query/interpreter.cpp +++ b/src/query/interpreter.cpp @@ -1545,74 +1545,181 @@ PreparedQuery PrepareDumpQuery(ParsedQuery parsed_query, std::map<std::string, T std::vector<std::vector<TypedValue>> AnalyzeGraphQueryHandler::AnalyzeGraphCreateStatistics( const std::span<std::string> labels, DbAccessor *execution_db_accessor) { using LPIndex = std::pair<storage::LabelId, storage::PropertyId>; + auto view = storage::View::OLD; - std::vector<std::vector<TypedValue>> results; - std::map<LPIndex, std::map<storage::PropertyValue, int64_t>> counter; + auto erase_not_specified_label_indices = [&labels, execution_db_accessor](auto &index_info) { + if (labels[0] == kAsterisk) { + return; + } - // Preprocess labels to avoid later checks - std::vector<LPIndex> indices_info = execution_db_accessor->ListAllIndices().label_property; - if (labels[0] != kAsterisk) { - for (auto it = indices_info.cbegin(); it != indices_info.cend();) { - if (std::find(labels.begin(), labels.end(), execution_db_accessor->LabelToName(it->first)) == labels.end()) { - it = indices_info.erase(it); + for (auto it = index_info.cbegin(); it != index_info.cend();) { + if (std::find(labels.begin(), labels.end(), execution_db_accessor->LabelToName(*it)) == labels.end()) { + it = index_info.erase(it); } else { ++it; } } - } - // Iterate over all indexed vertices - std::for_each(indices_info.begin(), indices_info.end(), [execution_db_accessor, &counter](const LPIndex &index_info) { - auto vertices = execution_db_accessor->Vertices(storage::View::OLD, index_info.first, index_info.second); - std::for_each(vertices.begin(), vertices.end(), [&index_info, &counter](const auto &vertex) { - counter[index_info][*vertex.GetProperty(storage::View::OLD, index_info.second)]++; - }); - }); + }; - results.reserve(counter.size()); - std::for_each(counter.begin(), counter.end(), [&results, execution_db_accessor](const auto &counter_entry) { - const auto &[label_property, values_map] = counter_entry; - std::vector<TypedValue> result; - result.reserve(kDeleteStatisticsNumResults); - // Extract info - int64_t count_property_value = std::accumulate( - values_map.begin(), values_map.end(), 0, - [](int64_t prev_value, const auto &prop_value_count) { return prev_value + prop_value_count.second; }); - // num_distinc_values will never be 0 - double avg_group_size = static_cast<double>(count_property_value) / static_cast<double>(values_map.size()); - double chi_squared_stat = std::accumulate( - values_map.begin(), values_map.end(), 0.0, [avg_group_size](double prev_result, const auto &value_entry) { - return prev_result + utils::ChiSquaredValue(value_entry.second, avg_group_size); + auto erase_not_specified_label_property_indices = [&labels, execution_db_accessor](auto &index_info) { + if (labels[0] == kAsterisk) { + return; + } + + for (auto it = index_info.cbegin(); it != index_info.cend();) { + if (std::find(labels.begin(), labels.end(), execution_db_accessor->LabelToName(it->first)) == labels.end()) { + it = index_info.erase(it); + } else { + ++it; + } + } + }; + + auto populate_label_stats = [execution_db_accessor, view](auto index_info) { + std::vector<std::pair<storage::LabelId, storage::LabelIndexStats>> label_stats; + label_stats.reserve(index_info.size()); + std::for_each(index_info.begin(), index_info.end(), + [execution_db_accessor, view, &label_stats](const storage::LabelId &label_id) { + auto vertices = execution_db_accessor->Vertices(view, label_id); + uint64_t no_vertices{0}; + uint64_t total_degree{0}; + std::for_each(vertices.begin(), vertices.end(), + [&total_degree, &no_vertices, &view](const auto &vertex) { + no_vertices++; + total_degree += *vertex.OutDegree(view) + *vertex.InDegree(view); + }); + + auto average_degree = + no_vertices > 0 ? static_cast<double>(total_degree) / static_cast<double>(no_vertices) : 0; + auto index_stats = storage::LabelIndexStats{.count = no_vertices, .avg_degree = average_degree}; + execution_db_accessor->SetIndexStats(label_id, index_stats); + label_stats.emplace_back(label_id, index_stats); + }); + + return label_stats; + }; + + auto populate_label_property_stats = [execution_db_accessor, view](auto &index_info) { + std::map<LPIndex, std::map<storage::PropertyValue, int64_t>> label_property_counter; + std::map<LPIndex, uint64_t> vertex_degree_counter; + // Iterate over all label property indexed vertices + std::for_each( + index_info.begin(), index_info.end(), + [execution_db_accessor, &label_property_counter, &vertex_degree_counter, view](const LPIndex &index_info) { + auto vertices = execution_db_accessor->Vertices(view, index_info.first, index_info.second); + std::for_each(vertices.begin(), vertices.end(), + [&index_info, &label_property_counter, &vertex_degree_counter, &view](const auto &vertex) { + label_property_counter[index_info][*vertex.GetProperty(view, index_info.second)]++; + vertex_degree_counter[index_info] += *vertex.OutDegree(view) + *vertex.InDegree(view); + }); }); - execution_db_accessor->SetIndexStats( - label_property.first, label_property.second, - storage::IndexStats{.statistic = chi_squared_stat, .avg_group_size = avg_group_size}); - // Save result - result.emplace_back(execution_db_accessor->LabelToName(label_property.first)); - result.emplace_back(execution_db_accessor->PropertyToName(label_property.second)); - result.emplace_back(count_property_value); - result.emplace_back(static_cast<int64_t>(values_map.size())); - result.emplace_back(avg_group_size); - result.emplace_back(chi_squared_stat); + + std::vector<std::pair<LPIndex, storage::LabelPropertyIndexStats>> label_property_stats; + label_property_stats.reserve(label_property_counter.size()); + std::for_each( + label_property_counter.begin(), label_property_counter.end(), + [execution_db_accessor, &vertex_degree_counter, &label_property_stats](const auto &counter_entry) { + const auto &[label_property, values_map] = counter_entry; + // Extract info + uint64_t count_property_value = std::accumulate( + values_map.begin(), values_map.end(), 0, + [](uint64_t prev_value, const auto &prop_value_count) { return prev_value + prop_value_count.second; }); + // num_distinc_values will never be 0 + double avg_group_size = static_cast<double>(count_property_value) / static_cast<double>(values_map.size()); + double chi_squared_stat = std::accumulate( + values_map.begin(), values_map.end(), 0.0, [avg_group_size](double prev_result, const auto &value_entry) { + return prev_result + utils::ChiSquaredValue(value_entry.second, avg_group_size); + }); + + double average_degree = count_property_value > 0 + ? static_cast<double>(vertex_degree_counter[label_property]) / + static_cast<double>(count_property_value) + : 0; + + auto index_stats = + storage::LabelPropertyIndexStats{.count = count_property_value, + .distinct_values_count = static_cast<uint64_t>(values_map.size()), + .statistic = chi_squared_stat, + .avg_group_size = avg_group_size, + .avg_degree = average_degree}; + execution_db_accessor->SetIndexStats(label_property.first, label_property.second, index_stats); + label_property_stats.push_back(std::make_pair(label_property, index_stats)); + }); + + return label_property_stats; + }; + + auto index_info = execution_db_accessor->ListAllIndices(); + + std::vector<storage::LabelId> label_indices_info = index_info.label; + erase_not_specified_label_indices(label_indices_info); + auto label_stats = populate_label_stats(label_indices_info); + + std::vector<LPIndex> label_property_indices_info = index_info.label_property; + erase_not_specified_label_property_indices(label_property_indices_info); + auto label_property_stats = populate_label_property_stats(label_property_indices_info); + + std::vector<std::vector<TypedValue>> results; + results.reserve(label_stats.size() + label_property_stats.size()); + + std::for_each(label_stats.begin(), label_stats.end(), [execution_db_accessor, &results](const auto &stat_entry) { + std::vector<TypedValue> result; + result.reserve(kComputeStatisticsNumResults); + + result.emplace_back(execution_db_accessor->LabelToName(stat_entry.first)); + result.emplace_back(TypedValue()); + result.emplace_back(static_cast<int64_t>(stat_entry.second.count)); + result.emplace_back(TypedValue()); + result.emplace_back(TypedValue()); + result.emplace_back(TypedValue()); + result.emplace_back(stat_entry.second.avg_degree); results.push_back(std::move(result)); }); + + std::for_each(label_property_stats.begin(), label_property_stats.end(), + [execution_db_accessor, &results](const auto &stat_entry) { + std::vector<TypedValue> result; + result.reserve(kComputeStatisticsNumResults); + + result.emplace_back(execution_db_accessor->LabelToName(stat_entry.first.first)); + result.emplace_back(execution_db_accessor->PropertyToName(stat_entry.first.second)); + result.emplace_back(static_cast<int64_t>(stat_entry.second.count)); + result.emplace_back(static_cast<int64_t>(stat_entry.second.distinct_values_count)); + result.emplace_back(stat_entry.second.avg_group_size); + result.emplace_back(stat_entry.second.statistic); + result.emplace_back(stat_entry.second.avg_degree); + results.push_back(std::move(result)); + }); + return results; } std::vector<std::vector<TypedValue>> AnalyzeGraphQueryHandler::AnalyzeGraphDeleteStatistics( const std::span<std::string> labels, DbAccessor *execution_db_accessor) { - std::vector<std::pair<storage::LabelId, storage::PropertyId>> loc_results; + std::vector<std::pair<storage::LabelId, storage::PropertyId>> label_prop_results; + std::vector<storage::LabelId> label_results; if (labels[0] == kAsterisk) { - loc_results = execution_db_accessor->ClearIndexStats(); + label_prop_results = execution_db_accessor->ClearLabelPropertyIndexStats(); + label_results = execution_db_accessor->ClearLabelIndexStats(); } else { - loc_results = execution_db_accessor->DeleteIndexStatsForLabels(labels); + label_prop_results = execution_db_accessor->DeleteLabelPropertyIndexStats(labels); + label_results = execution_db_accessor->DeleteLabelIndexStats(labels); } + std::vector<std::vector<TypedValue>> results; - std::transform(loc_results.begin(), loc_results.end(), std::back_inserter(results), + results.reserve(label_prop_results.size() + label_results.size()); + std::transform(label_prop_results.begin(), label_prop_results.end(), std::back_inserter(results), [execution_db_accessor](const auto &label_property_index) { return std::vector<TypedValue>{ TypedValue(execution_db_accessor->LabelToName(label_property_index.first)), TypedValue(execution_db_accessor->PropertyToName(label_property_index.second))}; }); + + std::transform( + label_results.begin(), label_results.end(), std::back_inserter(results), + [execution_db_accessor](const auto &label_index) { + return std::vector<TypedValue>{TypedValue(execution_db_accessor->LabelToName(label_index)), TypedValue("")}; + }); return results; } @@ -1621,7 +1728,8 @@ Callback HandleAnalyzeGraphQuery(AnalyzeGraphQuery *analyze_graph_query, DbAcces switch (analyze_graph_query->action_) { case AnalyzeGraphQuery::Action::ANALYZE: { callback.header = {"label", "property", "num estimation nodes", - "num groups", "avg group size", "chi-squared value"}; + "num groups", "avg group size", "chi-squared value", + "avg degree"}; callback.fn = [handler = AnalyzeGraphQueryHandler(), labels = analyze_graph_query->labels_, execution_db_accessor]() mutable { return handler.AnalyzeGraphCreateStatistics(labels, execution_db_accessor); diff --git a/src/query/plan/cost_estimator.hpp b/src/query/plan/cost_estimator.hpp index f9b71b0d8..3169d2e2f 100644 --- a/src/query/plan/cost_estimator.hpp +++ b/src/query/plan/cost_estimator.hpp @@ -15,9 +15,29 @@ #include "query/parameters.hpp" #include "query/plan/operator.hpp" #include "query/typed_value.hpp" +#include "utils/algorithm.hpp" +#include "utils/math.hpp" namespace memgraph::query::plan { +/** + * The symbol statistics specify essential DB statistics which + * help the query planner (namely here the cost estimator), to decide + * how to do expands and other types of Cypher manipulations. + */ +struct SymbolStatistics { + uint64_t count; + double degree; +}; + +/** + * Scope of the statistics for every scanned symbol in + * the operator tree. + */ +struct Scope { + std::unordered_map<std::string, SymbolStatistics> symbol_stats; +}; + /** * Query plan execution time cost estimator, for comparing and choosing optimal * execution plans. @@ -81,8 +101,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor { using HierarchicalLogicalOperatorVisitor::PostVisit; using HierarchicalLogicalOperatorVisitor::PreVisit; - CostEstimator(TDbAccessor *db_accessor, const Parameters ¶meters) - : db_accessor_(db_accessor), parameters(parameters) {} + CostEstimator(TDbAccessor *db_accessor, const SymbolTable &table, const Parameters ¶meters) + : db_accessor_(db_accessor), table_(table), parameters(parameters), scopes_{Scope()} {} + + CostEstimator(TDbAccessor *db_accessor, const SymbolTable &table, const Parameters ¶meters, Scope scope) + : db_accessor_(db_accessor), table_(table), parameters(parameters), scopes_{scope} {} bool PostVisit(ScanAll &) override { cardinality_ *= db_accessor_->VerticesCount(); @@ -92,6 +115,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor { } bool PostVisit(ScanAllByLabel &scan_all_by_label) override { + auto index_stats = db_accessor_->GetIndexStats(scan_all_by_label.label_); + if (index_stats.has_value()) { + SaveStatsFor(scan_all_by_label.output_symbol_, index_stats.value()); + } + cardinality_ *= db_accessor_->VerticesCount(scan_all_by_label.label_); // ScanAll performs some work for every element that is produced IncrementCost(CostParam::kScanAllByLabel); @@ -102,6 +130,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor { // This cardinality estimation depends on the property value (expression). // If it's a constant, we can evaluate cardinality exactly, otherwise // we estimate + auto index_stats = db_accessor_->GetIndexStats(logical_op.label_, logical_op.property_); + if (index_stats.has_value()) { + SaveStatsFor(logical_op.output_symbol_, index_stats.value()); + } + auto property_value = ConstPropertyValue(logical_op.expression_); double factor = 1.0; if (property_value) @@ -119,6 +152,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor { } bool PostVisit(ScanAllByLabelPropertyRange &logical_op) override { + auto index_stats = db_accessor_->GetIndexStats(logical_op.label_, logical_op.property_); + if (index_stats.has_value()) { + SaveStatsFor(logical_op.output_symbol_, index_stats.value()); + } + // this cardinality estimation depends on Bound expressions. // if they are literals we can evaluate cardinality properly auto lower = BoundToPropertyValue(logical_op.lower_bound_); @@ -144,6 +182,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor { } bool PostVisit(ScanAllByLabelProperty &logical_op) override { + auto index_stats = db_accessor_->GetIndexStats(logical_op.label_, logical_op.property_); + if (index_stats.has_value()) { + SaveStatsFor(logical_op.output_symbol_, index_stats.value()); + } + const auto factor = db_accessor_->VerticesCount(logical_op.label_, logical_op.property_); cardinality_ *= factor; IncrementCost(CostParam::MakeScanAllByLabelProperty); @@ -152,6 +195,20 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor { // TODO: Cost estimate ScanAllById? + bool PostVisit(Expand &expand) override { + auto card_param = CardParam::kExpand; + auto stats = GetStatsFor(expand.input_symbol_); + + if (stats.has_value()) { + card_param = stats.value().degree; + } + + cardinality_ *= card_param; + IncrementCost(CostParam::kExpand); + + return true; + } + // For the given op first increments the cardinality and then cost. #define POST_VISIT_CARD_FIRST(NAME) \ bool PostVisit(NAME &) override { \ @@ -160,7 +217,6 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor { return true; \ } - POST_VISIT_CARD_FIRST(Expand); POST_VISIT_CARD_FIRST(ExpandVariable); #undef POST_VISIT_CARD_FIRST @@ -225,20 +281,42 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor { return false; } + bool PostVisit(Produce &op) override { + auto scope = Scope(); + + // translate all the stats to the scope outside the return + for (const auto &symbol : op.ModifiedSymbols(table_)) { + auto stats = GetStatsFor(symbol); + if (stats.has_value()) { + scope.symbol_stats[symbol.name()] = + SymbolStatistics{.count = stats.value().count, .degree = stats.value().degree}; + } + } + + scopes_.push_back(std::move(scope)); + return true; + } + bool PreVisit(Apply &op) override { - double input_cost = EstimateCostOnBranch(&op.input_); - double subquery_cost = EstimateCostOnBranch(&op.subquery_); + // Get the cost of the main branch + op.input_->Accept(*this); - // if the query is a unit subquery, we don't want the cost to be zero but 1xN - input_cost = input_cost == 0 ? 1 : input_cost; - subquery_cost = subquery_cost == 0 ? 1 : subquery_cost; + // Estimate cost on the subquery branch independently, use a copy + auto &last_scope = scopes_.back(); + double subquery_cost = EstimateCostOnBranch(&op.subquery_, last_scope); + subquery_cost = !utils::ApproxEqualDecimal(subquery_cost, 0.0) ? subquery_cost : 1; + cardinality_ *= subquery_cost; - cardinality_ *= input_cost * subquery_cost; IncrementCost(CostParam::kSubquery); return false; } + bool PostVisit(EmptyResult & /*op*/) override { + scopes_.emplace_back(); + return true; + } + bool Visit(Once &) override { return true; } auto cost() const { return cost_; } @@ -255,12 +333,20 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor { // accessor used for cardinality estimates in ScanAll and ScanAllByLabel TDbAccessor *db_accessor_; + const SymbolTable &table_; const Parameters ¶meters; + std::vector<Scope> scopes_; void IncrementCost(double param) { cost_ += param * cardinality_; } double EstimateCostOnBranch(std::shared_ptr<LogicalOperator> *branch) { - CostEstimator<TDbAccessor> cost_estimator(db_accessor_, parameters); + CostEstimator<TDbAccessor> cost_estimator(db_accessor_, table_, parameters); + (*branch)->Accept(cost_estimator); + return cost_estimator.cost(); + } + + double EstimateCostOnBranch(std::shared_ptr<LogicalOperator> *branch, Scope scope) { + CostEstimator<TDbAccessor> cost_estimator(db_accessor_, table_, parameters, scope); (*branch)->Accept(cost_estimator); return cost_estimator.cost(); } @@ -287,12 +373,32 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor { } return std::nullopt; } + + bool HasStatsFor(const Symbol &symbol) const { return utils::Contains(scopes_.back().symbol_stats, symbol.name()); } + + std::optional<SymbolStatistics> GetStatsFor(const Symbol &symbol) { + if (!HasStatsFor(symbol)) { + return std::nullopt; + } + + auto &scope = scopes_.back(); + return scope.symbol_stats[symbol.name()]; + } + + template <typename T> + void SaveStatsFor(const Symbol &symbol, T index_stats) { + scopes_.back().symbol_stats[symbol.name()] = SymbolStatistics{ + .count = index_stats.count, + .degree = index_stats.avg_degree, + }; + } }; /** Returns the estimated cost of the given plan. */ template <class TDbAccessor> -double EstimatePlanCost(TDbAccessor *db, const Parameters ¶meters, LogicalOperator &plan) { - CostEstimator<TDbAccessor> estimator(db, parameters); +double EstimatePlanCost(TDbAccessor *db, const SymbolTable &table, const Parameters ¶meters, + LogicalOperator &plan) { + CostEstimator<TDbAccessor> estimator(db, table, parameters); plan.Accept(estimator); return estimator.cost(); } diff --git a/src/query/plan/planner.hpp b/src/query/plan/planner.hpp index cfc89ec73..443680c37 100644 --- a/src/query/plan/planner.hpp +++ b/src/query/plan/planner.hpp @@ -47,8 +47,9 @@ class PostProcessor final { } template <class TVertexCounts> - double EstimatePlanCost(const std::unique_ptr<LogicalOperator> &plan, TVertexCounts *vertex_counts) { - return query::plan::EstimatePlanCost(vertex_counts, parameters_, *plan); + double EstimatePlanCost(const std::unique_ptr<LogicalOperator> &plan, TVertexCounts *vertex_counts, + const SymbolTable &table) { + return query::plan::EstimatePlanCost(vertex_counts, table, parameters_, *plan); } }; @@ -97,7 +98,7 @@ auto MakeLogicalPlan(TPlanningContext *context, TPlanPostProcess *post_process, // Plans are generated lazily and the current plan will disappear, so // it's ok to move it. auto rewritten_plan = post_process->Rewrite(std::move(plan), context); - double cost = post_process->EstimatePlanCost(rewritten_plan, &vertex_counts); + double cost = post_process->EstimatePlanCost(rewritten_plan, &vertex_counts, *context->symbol_table); if (!curr_plan || cost < total_cost) { curr_plan.emplace(std::move(rewritten_plan)); total_cost = cost; @@ -106,7 +107,7 @@ auto MakeLogicalPlan(TPlanningContext *context, TPlanPostProcess *post_process, } else { auto plan = MakeLogicalPlanForSingleQuery<RuleBasedPlanner>(query_parts, context); auto rewritten_plan = post_process->Rewrite(std::move(plan), context); - total_cost = post_process->EstimatePlanCost(rewritten_plan, &vertex_counts); + total_cost = post_process->EstimatePlanCost(rewritten_plan, &vertex_counts, *context->symbol_table); curr_plan.emplace(std::move(rewritten_plan)); } diff --git a/src/query/plan/rewrite/index_lookup.hpp b/src/query/plan/rewrite/index_lookup.hpp index feac431fe..9e4fc7c9a 100644 --- a/src/query/plan/rewrite/index_lookup.hpp +++ b/src/query/plan/rewrite/index_lookup.hpp @@ -505,7 +505,7 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor { // FilterInfo with PropertyFilter. FilterInfo filter; int64_t vertex_count; - std::optional<storage::IndexStats> index_stats; + std::optional<storage::LabelPropertyIndexStats> index_stats; }; bool DefaultPreVisit() override { throw utils::NotYetImplemented("optimizing index lookup"); } @@ -572,8 +572,8 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor { * @param vertex_count: New index's number of vertices. * @return -1 if the new index is better, 0 if they are equal and 1 if the existing one is better. */ - auto compare_indices = [](std::optional<LabelPropertyIndex> &found, std::optional<storage::IndexStats> &new_stats, - int vertex_count) { + auto compare_indices = [](std::optional<LabelPropertyIndex> &found, + std::optional<storage::LabelPropertyIndexStats> &new_stats, int vertex_count) { if (!new_stats.has_value()) { return 0; } @@ -610,7 +610,8 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor { }; int64_t vertex_count = db_->VerticesCount(GetLabel(label), GetProperty(property)); - std::optional<storage::IndexStats> new_stats = db_->GetIndexStats(GetLabel(label), GetProperty(property)); + std::optional<storage::LabelPropertyIndexStats> new_stats = + db_->GetIndexStats(GetLabel(label), GetProperty(property)); // Conditions, from more to less important: // the index with 10x less vertices is better. diff --git a/src/query/plan/vertex_count_cache.hpp b/src/query/plan/vertex_count_cache.hpp index f00070eaa..ff19ee95a 100644 --- a/src/query/plan/vertex_count_cache.hpp +++ b/src/query/plan/vertex_count_cache.hpp @@ -78,8 +78,12 @@ class VertexCountCache { return db_->LabelPropertyIndexExists(label, property); } - std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label, - const storage::PropertyId &property) const { + std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const { + return db_->GetIndexStats(label); + } + + std::optional<storage::LabelPropertyIndexStats> GetIndexStats(const storage::LabelId &label, + const storage::PropertyId &property) const { return db_->GetIndexStats(label, property); } diff --git a/src/storage/v2/indices.cpp b/src/storage/v2/indices.cpp index ebf022245..596631899 100644 --- a/src/storage/v2/indices.cpp +++ b/src/storage/v2/indices.cpp @@ -478,6 +478,40 @@ void LabelIndex::RunGC() { } } +void LabelIndex::SetIndexStats(const storage::LabelId &label, const storage::LabelIndexStats &stats) { + stats_[label] = stats; +} + +std::optional<LabelIndexStats> LabelIndex::GetIndexStats(const storage::LabelId &label) const { + if (auto it = stats_.find(label); it != stats_.end()) { + return it->second; + } + return {}; +} + +std::vector<LabelId> LabelIndex::ClearIndexStats() { + std::vector<LabelId> deleted_indexes; + deleted_indexes.reserve(stats_.size()); + std::transform(stats_.begin(), stats_.end(), std::back_inserter(deleted_indexes), + [](const auto &elem) { return elem.first; }); + stats_.clear(); + return deleted_indexes; +} + +std::vector<LabelId> LabelIndex::DeleteIndexStats(const storage::LabelId &label) { + std::vector<LabelId> deleted_indexes; + for (auto it = stats_.cbegin(); it != stats_.cend();) { + if (it->first == label) { + deleted_indexes.push_back(it->first); + it = stats_.erase(it); + } else { + ++it; + } + } + + return deleted_indexes; +} + bool LabelPropertyIndex::Entry::operator<(const Entry &rhs) { if (value < rhs.value) { return true; @@ -814,8 +848,7 @@ int64_t LabelPropertyIndex::ApproximateVertexCount(LabelId label, PropertyId pro /* Iterate over all property-label pairs and deletes if label from the index is equal to label parameter. */ -std::vector<std::pair<LabelId, PropertyId>> LabelPropertyIndex::DeleteIndexStatsForLabel( - const storage::LabelId &label) { +std::vector<std::pair<LabelId, PropertyId>> LabelPropertyIndex::DeleteIndexStats(const storage::LabelId &label) { std::vector<std::pair<LabelId, PropertyId>> deleted_indexes; for (auto it = stats_.cbegin(); it != stats_.cend();) { if (it->first.first == label) { @@ -837,14 +870,14 @@ std::vector<std::pair<LabelId, PropertyId>> LabelPropertyIndex::ClearIndexStats( return deleted_indexes; } -void LabelPropertyIndex::SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property, - const IndexStats &stats) { - stats_[{label, property}] = stats; +void LabelPropertyIndex::SetIndexStats(const std::pair<storage::LabelId, storage::PropertyId> &key, + const storage::LabelPropertyIndexStats &stats) { + stats_[key] = stats; } -std::optional<IndexStats> LabelPropertyIndex::GetIndexStats(const storage::LabelId &label, - const storage::PropertyId &property) const { - if (auto it = stats_.find({label, property}); it != stats_.end()) { +std::optional<storage::LabelPropertyIndexStats> LabelPropertyIndex::GetIndexStats( + const std::pair<storage::LabelId, storage::PropertyId> &key) const { + if (auto it = stats_.find(key); it != stats_.end()) { return it->second; } return {}; diff --git a/src/storage/v2/indices.hpp b/src/storage/v2/indices.hpp index 14b023110..b5dc28114 100644 --- a/src/storage/v2/indices.hpp +++ b/src/storage/v2/indices.hpp @@ -31,6 +31,11 @@ struct Constraints; using ParalellizedIndexCreationInfo = std::pair<std::vector<std::pair<Gid, uint64_t>> /*vertex_recovery_info*/, uint64_t /*thread_count*/>; +struct LabelIndexStats { + uint64_t count; + double avg_degree; +}; + class LabelIndex { private: struct Entry { @@ -124,19 +129,29 @@ class LabelIndex { return it->second.size(); } + void SetIndexStats(const storage::LabelId &label, const storage::LabelIndexStats &stats); + + std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const; + + std::vector<LabelId> ClearIndexStats(); + + std::vector<LabelId> DeleteIndexStats(const storage::LabelId &label); + void Clear() { index_.clear(); } void RunGC(); private: std::map<LabelId, utils::SkipList<Entry>> index_; + std::map<LabelId, storage::LabelIndexStats> stats_; Indices *indices_; Constraints *constraints_; Config::Items config_; }; -struct IndexStats { - double statistic, avg_group_size; +struct LabelPropertyIndexStats { + uint64_t count, distinct_values_count; + double statistic, avg_group_size, avg_degree; }; class LabelPropertyIndex { @@ -248,13 +263,13 @@ class LabelPropertyIndex { std::vector<std::pair<LabelId, PropertyId>> ClearIndexStats(); - std::vector<std::pair<LabelId, PropertyId>> DeleteIndexStatsForLabel(const storage::LabelId &label); + std::vector<std::pair<LabelId, PropertyId>> DeleteIndexStats(const storage::LabelId &label); - void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property, - const storage::IndexStats &stats); + void SetIndexStats(const std::pair<storage::LabelId, storage::PropertyId> &key, + const storage::LabelPropertyIndexStats &stats); - std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label, - const storage::PropertyId &property) const; + std::optional<storage::LabelPropertyIndexStats> GetIndexStats( + const std::pair<storage::LabelId, storage::PropertyId> &key) const; void Clear() { index_.clear(); } @@ -262,7 +277,7 @@ class LabelPropertyIndex { private: std::map<std::pair<LabelId, PropertyId>, utils::SkipList<Entry>> index_; - std::map<std::pair<LabelId, PropertyId>, storage::IndexStats> stats_; + std::map<std::pair<LabelId, PropertyId>, storage::LabelPropertyIndexStats> stats_; Indices *indices_; Constraints *constraints_; Config::Items config_; diff --git a/src/storage/v2/storage.hpp b/src/storage/v2/storage.hpp index 79923f5f7..207ee5290 100644 --- a/src/storage/v2/storage.hpp +++ b/src/storage/v2/storage.hpp @@ -267,28 +267,66 @@ class Storage final { return storage_->indices_.label_property_index.ApproximateVertexCount(label, property, lower, upper); } - std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label, - const storage::PropertyId &property) const { - return storage_->indices_.label_property_index.GetIndexStats(label, property); + template <typename TResult, typename TIndex, typename TIndexKey> + std::optional<TResult> GetIndexStatsForIndex(TIndex &index, TIndexKey &&key) const { + return index.GetIndexStats(key); } - std::vector<std::pair<LabelId, PropertyId>> ClearIndexStats() { - return storage_->indices_.label_property_index.ClearIndexStats(); + std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const { + return GetIndexStatsForIndex<storage::LabelIndexStats>(storage_->indices_.label_index, label); } - std::vector<std::pair<LabelId, PropertyId>> DeleteIndexStatsForLabels(const std::span<std::string> labels) { - std::vector<std::pair<LabelId, PropertyId>> deleted_indexes; - std::for_each(labels.begin(), labels.end(), [this, &deleted_indexes](const auto &label_str) { - std::vector<std::pair<LabelId, PropertyId>> loc_results = - storage_->indices_.label_property_index.DeleteIndexStatsForLabel(NameToLabel(label_str)); + std::optional<storage::LabelPropertyIndexStats> GetIndexStats(const storage::LabelId &label, + const storage::PropertyId &property) const { + return GetIndexStatsForIndex<storage::LabelPropertyIndexStats>(storage_->indices_.label_property_index, + std::make_pair(label, property)); + } + + template <typename TIndex, typename TIndexKey, typename TIndexStats> + void SetIndexStatsForIndex(TIndex &index, TIndexKey &&key, TIndexStats &stats) const { + index.SetIndexStats(key, stats); + } + + void SetIndexStats(const storage::LabelId &label, const LabelIndexStats &stats) { + SetIndexStatsForIndex(storage_->indices_.label_index, label, stats); + } + + void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property, + const LabelPropertyIndexStats &stats) { + SetIndexStatsForIndex(storage_->indices_.label_property_index, std::make_pair(label, property), stats); + } + + template <typename TResult, typename TIndex> + std::vector<TResult> ClearIndexStatsForIndex(TIndex &index) const { + return index.ClearIndexStats(); + } + + std::vector<std::pair<LabelId, PropertyId>> ClearLabelPropertyIndexStats() { + return ClearIndexStatsForIndex<std::pair<LabelId, PropertyId>>(storage_->indices_.label_property_index); + } + + std::vector<LabelId> ClearLabelIndexStats() { + return ClearIndexStatsForIndex<LabelId>(storage_->indices_.label_index); + } + + template <typename TResult, typename TIndex> + std::vector<TResult> DeleteIndexStatsForIndex(TIndex &index, const std::span<std::string> labels) { + std::vector<TResult> deleted_indexes; + + for (const auto &label : labels) { + std::vector<TResult> loc_results = index.DeleteIndexStats(NameToLabel(label)); deleted_indexes.insert(deleted_indexes.end(), std::make_move_iterator(loc_results.begin()), std::make_move_iterator(loc_results.end())); - }); + } return deleted_indexes; } - void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property, const IndexStats &stats) { - storage_->indices_.label_property_index.SetIndexStats(label, property, stats); + std::vector<std::pair<LabelId, PropertyId>> DeleteLabelPropertyIndexStats(const std::span<std::string> labels) { + return DeleteIndexStatsForIndex<std::pair<LabelId, PropertyId>>(storage_->indices_.label_property_index, labels); + } + + std::vector<LabelId> DeleteLabelIndexStats(const std::span<std::string> labels) { + return DeleteIndexStatsForIndex<LabelId>(storage_->indices_.label_index, labels); } /// @return Accessor to the deleted vertex if a deletion took place, std::nullopt otherwise diff --git a/tests/benchmark/query/planner.cpp b/tests/benchmark/query/planner.cpp index b73258a87..4fe4ee28c 100644 --- a/tests/benchmark/query/planner.cpp +++ b/tests/benchmark/query/planner.cpp @@ -131,7 +131,7 @@ static void BM_PlanAndEstimateIndexedMatching(benchmark::State &state) { auto plans = memgraph::query::plan::MakeLogicalPlanForSingleQuery<memgraph::query::plan::VariableStartPlanner>( query_parts, &ctx); for (auto plan : plans) { - memgraph::query::plan::EstimatePlanCost(&dba, parameters, *plan); + memgraph::query::plan::EstimatePlanCost(&dba, symbol_table, parameters, *plan); } } } @@ -161,7 +161,7 @@ static void BM_PlanAndEstimateIndexedMatchingWithCachedCounts(benchmark::State & auto plans = memgraph::query::plan::MakeLogicalPlanForSingleQuery<memgraph::query::plan::VariableStartPlanner>( query_parts, &ctx); for (auto plan : plans) { - memgraph::query::plan::EstimatePlanCost(&vertex_counts, parameters, *plan); + memgraph::query::plan::EstimatePlanCost(&vertex_counts, symbol_table, parameters, *plan); } } } diff --git a/tests/e2e/analyze_graph/common.py b/tests/e2e/analyze_graph/common.py index d8b0cabf7..c43d9cd79 100644 --- a/tests/e2e/analyze_graph/common.py +++ b/tests/e2e/analyze_graph/common.py @@ -13,6 +13,7 @@ import typing import mgclient import pytest +from gqlalchemy import Memgraph def execute_and_fetch_all(cursor: mgclient.Cursor, query: str, params: dict = {}) -> typing.List[tuple]: @@ -27,3 +28,14 @@ def connect(**kwargs) -> mgclient.Connection: yield connection cursor = connection.cursor() execute_and_fetch_all(cursor, "MATCH (n) DETACH DELETE n") + + +@pytest.fixture +def memgraph(**kwargs) -> Memgraph: + memgraph = Memgraph() + + yield memgraph + + memgraph.drop_database() + memgraph.execute("analyze graph delete statistics;") + memgraph.drop_indexes() diff --git a/tests/e2e/analyze_graph/optimize_indexes.py b/tests/e2e/analyze_graph/optimize_indexes.py index 82290c673..be6a72c1e 100644 --- a/tests/e2e/analyze_graph/optimize_indexes.py +++ b/tests/e2e/analyze_graph/optimize_indexes.py @@ -12,7 +12,10 @@ import sys import pytest -from common import connect, execute_and_fetch_all +from common import connect, execute_and_fetch_all, memgraph + +QUERY_PLAN = "QUERY PLAN" + # E2E tests for checking query semantic # ------------------------------------ @@ -96,8 +99,8 @@ def test_analyze_full_graph(analyze_query, connect): else: first_index = 1 # Check results - assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0) - assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 5, 10, 0) + assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0, 0) + assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 5, 10, 0, 0) # After analyzing graph, id1 index should be chosen because it has smaller average group size expected_explain_after_analysis = [ (f" * Produce {{n}}",), @@ -131,8 +134,8 @@ def test_cardinality_different_avg_group_size_uniform_dist(connect): else: first_index = 1 # Check results - assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0) - assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 20, 5, 0) + assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0, 0) + assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 20, 5, 0, 0) expected_explain_after_analysis = [ (f" * Produce {{n}}",), (f" * Filter",), @@ -161,8 +164,8 @@ def test_cardinality_same_avg_group_size_uniform_dist_diff_vertex_count(connect) else: first_index = 1 # Check results - assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0) - assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 50, 1, 0) + assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0, 0) + assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 50, 1, 0, 0) expected_explain_after_analysis = [ (f" * Produce {{n}}",), (f" * Filter",), @@ -191,8 +194,8 @@ def test_large_diff_in_num_vertices_v1(connect): else: first_index = 1 # Check results - assert analyze_graph_results[first_index] == ("Label", "id1", 1000, 1000, 1, 0) - assert analyze_graph_results[1 - first_index] == ("Label", "id2", 99, 1, 99, 0) + assert analyze_graph_results[first_index] == ("Label", "id1", 1000, 1000, 1, 0, 0) + assert analyze_graph_results[1 - first_index] == ("Label", "id2", 99, 1, 99, 0, 0) expected_explain_after_analysis = [ (f" * Produce {{n}}",), (f" * Filter",), @@ -221,8 +224,8 @@ def test_large_diff_in_num_vertices_v2(connect): else: first_index = 1 # Check results - assert analyze_graph_results[first_index] == ("Label", "id1", 99, 1, 99, 0) - assert analyze_graph_results[1 - first_index] == ("Label", "id2", 1000, 1000, 1, 0) + assert analyze_graph_results[first_index] == ("Label", "id1", 99, 1, 99, 0, 0) + assert analyze_graph_results[1 - first_index] == ("Label", "id2", 1000, 1000, 1, 0, 0) expected_explain_after_analysis = [ (f" * Produce {{n}}",), (f" * Filter",), @@ -261,8 +264,8 @@ def test_same_avg_group_size_diff_distribution(connect): else: first_index = 1 # Check results - assert analyze_graph_results[first_index] == ("Label", "id1", 100, 5, 20, 32.5) - assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 5, 20, 0) + assert analyze_graph_results[first_index] == ("Label", "id1", 100, 5, 20, 32.5, 0) + assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 5, 20, 0, 0) expected_explain_after_analysis = [ (f" * Produce {{n}}",), (f" * Filter",), @@ -278,5 +281,194 @@ def test_same_avg_group_size_diff_distribution(connect): execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);") +def test_given_supernode_when_expanding_then_expand_other_way_around(memgraph): + memgraph.execute("FOREACH (i in range(1, 1000) | CREATE (:Node {id: i}));") + memgraph.execute("CREATE (:SuperNode {id: 1});") + memgraph.execute("CREATE INDEX ON :SuperNode(id);") + memgraph.execute("CREATE INDEX ON :SuperNode;") + memgraph.execute("CREATE INDEX ON :Node(id);") + memgraph.execute("CREATE INDEX ON :Node;") + memgraph.execute("match (n:Node) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);") + + query = "explain match (n:Node) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);" + expected_explain = [ + f" * EmptyResult", + f" * Merge", + f" |\\ On Match", + f" | * Expand (s)-[anon3:HAS_REL_TO]->(n)", + f" | * Once", + f" |\\ On Create", + f" | * CreateExpand (n)<-[anon3:HAS_REL_TO]-(s)", + f" | * Once", + f" * ScanAllByLabel (n :Node)", + f" * ScanAllByLabelPropertyValue (s :SuperNode {{id}})", + f" * Once", + ] + + result_without_analysis = list(memgraph.execute_and_fetch(query)) + result_without_analysis = [x[QUERY_PLAN] for x in result_without_analysis] + assert expected_explain == result_without_analysis + + memgraph.execute("analyze graph;") + + expected_explain = [ + x.replace(f" | * Expand (s)-[anon3:HAS_REL_TO]->(n)", f" | * Expand (n)<-[anon3:HAS_REL_TO]-(s)") + for x in expected_explain + ] + + result_with_analysis = list(memgraph.execute_and_fetch(query)) + result_with_analysis = [x[QUERY_PLAN] for x in result_with_analysis] + + assert expected_explain == result_with_analysis + + +def test_given_supernode_when_subquery_then_carry_information_to_subquery(memgraph): + memgraph.execute("FOREACH (i in range(1, 1000) | CREATE (:Node {id: i}));") + memgraph.execute("FOREACH (i in range(1, 1000) | CREATE (:Node2 {id: i}));") + memgraph.execute("CREATE (:SuperNode {id: 1});") + memgraph.execute("CREATE INDEX ON :SuperNode(id);") + memgraph.execute("CREATE INDEX ON :SuperNode;") + memgraph.execute("CREATE INDEX ON :Node(id);") + memgraph.execute("CREATE INDEX ON :Node;") + memgraph.execute("CREATE INDEX ON :Node2(id);") + memgraph.execute("CREATE INDEX ON :Node2;") + + memgraph.execute("match (n:Node) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);") + memgraph.execute("match (n:Node2) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);") + + query = ( + "explain match (n:Node) match (s:SuperNode {id: 1}) call { with n, s merge (n)<-[:HAS_REL_TO]-(s) } return 1" + ) + expected_explain = [ + f" * Produce {{0}}", + f" * Accumulate", + f" * Accumulate", + f" * Apply", + f" |\\ ", + f" | * EmptyResult", + f" | * Merge", + f" | |\\ On Match", + f" | | * Expand (s)-[anon3:HAS_REL_TO]->(n)", + f" | | * Once", + f" | |\\ On Create", + f" | | * CreateExpand (n)<-[anon3:HAS_REL_TO]-(s)", + f" | | * Once", + f" | * Produce {{n, s}}", + f" | * Once", + f" * ScanAllByLabel (n :Node)", + f" * ScanAllByLabelPropertyValue (s :SuperNode {{id}})", + f" * Once", + ] + + result_without_analysis = list(memgraph.execute_and_fetch(query)) + result_without_analysis = [x[QUERY_PLAN] for x in result_without_analysis] + assert expected_explain == result_without_analysis + + memgraph.execute("analyze graph;") + + expected_explain = [ + x.replace(f" | | * Expand (s)-[anon3:HAS_REL_TO]->(n)", f" | | * Expand (n)<-[anon3:HAS_REL_TO]-(s)") + for x in expected_explain + ] + result_with_analysis = list(memgraph.execute_and_fetch(query)) + result_with_analysis = [x[QUERY_PLAN] for x in result_with_analysis] + + assert expected_explain == result_with_analysis + + +def test_given_supernode_when_subquery_and_union_then_carry_information(memgraph): + memgraph.execute("FOREACH (i in range(1, 1000) | CREATE (:Node {id: i}));") + memgraph.execute("FOREACH (i in range(1, 1000) | CREATE (:Node2 {id: i}));") + memgraph.execute("CREATE (:SuperNode {id: 1});") + memgraph.execute("CREATE INDEX ON :SuperNode(id);") + memgraph.execute("CREATE INDEX ON :SuperNode;") + memgraph.execute("CREATE INDEX ON :Node(id);") + memgraph.execute("CREATE INDEX ON :Node;") + memgraph.execute("CREATE INDEX ON :Node2(id);") + memgraph.execute("CREATE INDEX ON :Node2;") + + memgraph.execute("match (n:Node) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);") + memgraph.execute("match (n:Node2) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);") + + query = "explain match (n:Node) match (s:SuperNode {id: 1}) call { with n, s merge (n)<-[:HAS_REL_TO]-(s) } return s union all match (n:Node) match (s:SuperNode {id: 1}) call { with n, s merge (n)<-[:HAS_REL_TO]-(s) } return s;" + expected_explain = [ + f" * Union {{s : s}}", + f" |\\ ", + f" | * Produce {{s}}", + f" | * Accumulate", + f" | * Accumulate", + f" | * Apply", + f" | |\\ ", + f" | | * EmptyResult", + f" | | * Merge", + f" | | |\\ On Match", + f" | | | * Expand (s)-[anon7:HAS_REL_TO]->(n)", + f" | | | * Once", + f" | | |\\ On Create", + f" | | | * CreateExpand (n)<-[anon7:HAS_REL_TO]-(s)", + f" | | | * Once", + f" | | * Produce {{n, s}}", + f" | | * Once", + f" | * ScanAllByLabel (n :Node)", + f" | * ScanAllByLabelPropertyValue (s :SuperNode {{id}})", + f" | * Once", + f" * Produce {{s}}", + f" * Accumulate", + f" * Accumulate", + f" * Apply", + f" |\\ ", + f" | * EmptyResult", + f" | * Merge", + f" | |\\ On Match", + f" | | * Expand (s)-[anon3:HAS_REL_TO]->(n)", + f" | | * Once", + f" | |\\ On Create", + f" | | * CreateExpand (n)<-[anon3:HAS_REL_TO]-(s)", + f" | | * Once", + f" | * Produce {{n, s}}", + f" | * Once", + f" * ScanAllByLabel (n :Node)", + f" * ScanAllByLabelPropertyValue (s :SuperNode {{id}})", + f" * Once", + ] + + result_without_analysis = list(memgraph.execute_and_fetch(query)) + result_without_analysis = [x[QUERY_PLAN] for x in result_without_analysis] + assert expected_explain == result_without_analysis + + memgraph.execute("analyze graph;") + + expected_explain = [ + x.replace(f" | | * Expand (s)-[anon3:HAS_REL_TO]->(n)", f" | | * Expand (n)<-[anon3:HAS_REL_TO]-(s)") + for x in expected_explain + ] + expected_explain = [ + x.replace(f" | | | * Expand (s)-[anon7:HAS_REL_TO]->(n)", f" | | | * Expand (n)<-[anon7:HAS_REL_TO]-(s)") + for x in expected_explain + ] + result_with_analysis = list(memgraph.execute_and_fetch(query)) + result_with_analysis = [x[QUERY_PLAN] for x in result_with_analysis] + + assert expected_explain == result_with_analysis + + +def test_given_empty_graph_when_analyzing_graph_return_zero_degree(memgraph): + memgraph.execute("CREATE INDEX ON :Node;") + + label_stats = next(memgraph.execute_and_fetch("analyze graph;")) + + expected_analysis = { + "label": "Node", + "property": None, + "num estimation nodes": 0, + "num groups": None, + "avg group size": None, + "chi-squared value": None, + "avg degree": 0.0, + } + + assert set(label_stats) == set(expected_analysis) + + if __name__ == "__main__": sys.exit(pytest.main([__file__, "-rA"])) diff --git a/tests/manual/interactive_planning.cpp b/tests/manual/interactive_planning.cpp index 3772b3b9d..a8b66ff26 100644 --- a/tests/manual/interactive_planning.cpp +++ b/tests/manual/interactive_planning.cpp @@ -27,6 +27,7 @@ #include "query/plan/planner.hpp" #include "query/plan/pretty_print.hpp" #include "query/typed_value.hpp" +#include "storage/v2/indices.hpp" #include "storage/v2/property_value.hpp" #include "utils/string.hpp" @@ -213,8 +214,12 @@ class InteractiveDbAccessor { return label_property_index_.at(key); } - std::optional<memgraph::storage::IndexStats> GetIndexStats(memgraph::storage::LabelId label, - memgraph::storage::PropertyId property) const { + std::optional<memgraph::storage::LabelIndexStats> GetIndexStats(const memgraph::storage::LabelId label) const { + return dba_->GetIndexStats(label); + } + + std::optional<memgraph::storage::LabelPropertyIndexStats> GetIndexStats( + const memgraph::storage::LabelId label, const memgraph::storage::PropertyId property) const { return dba_->GetIndexStats(label, property); } @@ -458,7 +463,7 @@ auto MakeLogicalPlans(memgraph::query::CypherQuery *query, memgraph::query::AstS memgraph::query::AstStorage ast_copy; auto unoptimized_plan = plan->Clone(&ast_copy); auto rewritten_plan = post_process.Rewrite(std::move(plan), &ctx); - double cost = post_process.EstimatePlanCost(rewritten_plan, dba); + double cost = post_process.EstimatePlanCost(rewritten_plan, dba, symbol_table); interactive_plans.push_back( InteractivePlan{std::move(unoptimized_plan), std::move(ast_copy), std::move(rewritten_plan), cost}); } diff --git a/tests/unit/query_cost_estimator.cpp b/tests/unit/query_cost_estimator.cpp index 2253ce0f2..d89031b82 100644 --- a/tests/unit/query_cost_estimator.cpp +++ b/tests/unit/query_cost_estimator.cpp @@ -74,7 +74,7 @@ class QueryCostEstimator : public ::testing::Test { } auto Cost() { - CostEstimator<memgraph::query::DbAccessor> cost_estimator(&*dba, parameters_); + CostEstimator<memgraph::query::DbAccessor> cost_estimator(&*dba, symbol_table_, parameters_); last_op_->Accept(cost_estimator); return cost_estimator.cost(); } @@ -201,7 +201,7 @@ TEST_F(QueryCostEstimator, SubqueryCartesian) { std::shared_ptr<LogicalOperator> input = std::make_shared<ScanAll>(std::make_shared<Once>(), NextSymbol()); std::shared_ptr<LogicalOperator> subquery = std::make_shared<ScanAll>(std::make_shared<Once>(), NextSymbol()); MakeOp<memgraph::query::plan::Apply>(input, subquery, true); - EXPECT_COST(CostParam::kSubquery * no_vertices * no_vertices); + EXPECT_COST(CostParam::kSubquery * no_vertices * no_vertices + no_vertices); } TEST_F(QueryCostEstimator, UnitSubquery) { diff --git a/tests/unit/query_plan_checker.hpp b/tests/unit/query_plan_checker.hpp index 64a5b0471..0a8b4d3ab 100644 --- a/tests/unit/query_plan_checker.hpp +++ b/tests/unit/query_plan_checker.hpp @@ -500,9 +500,13 @@ class FakeDbAccessor { return false; } - memgraph::storage::IndexStats GetIndexStats(memgraph::storage::LabelId label, - memgraph::storage::PropertyId property) const { - return memgraph::storage::IndexStats{.statistic = 0, .avg_group_size = 1}; // unique id + std::optional<memgraph::storage::LabelPropertyIndexStats> GetIndexStats( + const memgraph::storage::LabelId label, const memgraph::storage::PropertyId property) const { + return memgraph::storage::LabelPropertyIndexStats{.statistic = 0, .avg_group_size = 1}; // unique id + } + + std::optional<memgraph::storage::LabelIndexStats> GetIndexStats(const memgraph::storage::LabelId label) const { + return memgraph::storage::LabelIndexStats{.count = 0, .avg_degree = 0}; // unique id } void SetIndexCount(memgraph::storage::LabelId label, int64_t count) { label_index_[label] = count; } diff --git a/tests/unit/query_semantic.cpp b/tests/unit/query_semantic.cpp index be3fd60d9..ee18380b2 100644 --- a/tests/unit/query_semantic.cpp +++ b/tests/unit/query_semantic.cpp @@ -1252,4 +1252,11 @@ TEST_F(TestSymbolGenerator, Subqueries) { query = QUERY(SINGLE_QUERY(MATCH(PATTERN(NODE("n"))), CALL_SUBQUERY(subquery), RETURN("n", "m"))); symbol_table = MakeSymbolTable(query); ASSERT_EQ(symbol_table.max_position(), 11); + + // MATCH (n) CALL { MATCH (s) RETURN s } RETURN n UNION MATCH (n) CALL { MATCH (s) RETURN s } RETURN n + subquery = QUERY(SINGLE_QUERY(MATCH(PATTERN(NODE("s"))), RETURN("s"))); + query = QUERY(SINGLE_QUERY(MATCH(PATTERN(NODE("n"))), CALL_SUBQUERY(subquery), RETURN("n")), + UNION(SINGLE_QUERY(MATCH(PATTERN(NODE("n"))), CALL_SUBQUERY(subquery), RETURN("n")))); + symbol_table = MakeSymbolTable(query); + ASSERT_EQ(symbol_table.max_position(), 13); }