Add vertex degree to index statistics (#1026)

Add graph analysis of vertex degrees when doing ANALYZE GRAPH.
This commit is contained in:
Josipmrden 2023-06-27 18:06:20 +02:00 committed by GitHub
parent 261aa4f49b
commit 84721f7e0a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 686 additions and 135 deletions

View File

@ -15,5 +15,5 @@
namespace memgraph::query {
inline const std::string kAsterisk = "*";
inline constexpr uint16_t kDeleteStatisticsNumResults = 6;
inline constexpr uint16_t kComputeStatisticsNumResults = 7;
} // namespace memgraph::query

View File

@ -430,22 +430,36 @@ class DbAccessor final {
return accessor_->LabelPropertyIndexExists(label, prop);
}
std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const {
return accessor_->GetIndexStats(label);
}
std::optional<storage::LabelPropertyIndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
return accessor_->GetIndexStats(label, property);
}
std::vector<std::pair<storage::LabelId, storage::PropertyId>> ClearIndexStats() {
return accessor_->ClearIndexStats();
std::vector<std::pair<storage::LabelId, storage::PropertyId>> ClearLabelPropertyIndexStats() {
return accessor_->ClearLabelPropertyIndexStats();
}
std::vector<std::pair<storage::LabelId, storage::PropertyId>> DeleteIndexStatsForLabels(
std::vector<storage::LabelId> ClearLabelIndexStats() { return accessor_->ClearLabelIndexStats(); }
std::vector<std::pair<storage::LabelId, storage::PropertyId>> DeleteLabelPropertyIndexStats(
const std::span<std::string> labels) {
return accessor_->DeleteIndexStatsForLabels(labels);
return accessor_->DeleteLabelPropertyIndexStats(labels);
}
std::vector<storage::LabelId> DeleteLabelIndexStats(const std::span<std::string> labels) {
return accessor_->DeleteLabelIndexStats(labels);
}
void SetIndexStats(const storage::LabelId &label, const storage::LabelIndexStats &stats) {
accessor_->SetIndexStats(label, stats);
}
void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property,
const storage::IndexStats &stats) {
const storage::LabelPropertyIndexStats &stats) {
accessor_->SetIndexStats(label, property, stats);
}

View File

@ -157,25 +157,34 @@ void SymbolGenerator::VisitReturnBody(ReturnBody &body, Where *where) {
// Query
bool SymbolGenerator::PreVisit(SingleQuery &) {
prev_return_names_ = curr_return_names_;
curr_return_names_.clear();
auto &scope = scopes_.back();
scope.prev_return_names = scope.curr_return_names;
scope.curr_return_names.clear();
return true;
}
// Union
bool SymbolGenerator::PreVisit(CypherUnion &) {
scopes_.back() = Scope();
auto next_scope = Scope();
next_scope.curr_return_names = scopes_.back().curr_return_names;
scopes_.pop_back();
scopes_.push_back(next_scope);
return true;
}
bool SymbolGenerator::PostVisit(CypherUnion &cypher_union) {
if (prev_return_names_ != curr_return_names_) {
auto &scope = scopes_.back();
if (scope.prev_return_names != scope.curr_return_names) {
throw SemanticException("All subqueries in an UNION must have the same column names.");
}
// create new symbols for the result of the union
for (const auto &name : curr_return_names_) {
for (const auto &name : scope.curr_return_names) {
auto symbol = CreateSymbol(name, false);
cypher_union.union_symbols_.push_back(symbol);
}
@ -259,7 +268,9 @@ bool SymbolGenerator::PreVisit(Return &ret) {
}
bool SymbolGenerator::PostVisit(Return &) {
for (const auto &name_symbol : scopes_.back().symbols) curr_return_names_.insert(name_symbol.first);
auto &scope = scopes_.back();
for (const auto &name_symbol : scope.symbols) scope.curr_return_names.insert(name_symbol.first);
return true;
}

View File

@ -140,6 +140,8 @@ class SymbolGenerator : public HierarchicalTreeVisitor {
std::vector<Identifier *> identifiers_in_match;
// Number of nested IfOperators.
int num_if_operators{0};
std::unordered_set<std::string> prev_return_names{};
std::unordered_set<std::string> curr_return_names{};
};
static std::optional<Symbol> FindSymbolInScope(const std::string &name, const Scope &scope, Symbol::Type type);
@ -171,8 +173,6 @@ class SymbolGenerator : public HierarchicalTreeVisitor {
// is mapped by its name.
std::unordered_map<std::string, Identifier *> predefined_identifiers_;
std::vector<Scope> scopes_;
std::unordered_set<std::string> prev_return_names_;
std::unordered_set<std::string> curr_return_names_;
};
inline SymbolTable MakeSymbolTable(CypherQuery *query, const std::vector<Identifier *> &predefined_identifiers = {}) {

View File

@ -1545,74 +1545,181 @@ PreparedQuery PrepareDumpQuery(ParsedQuery parsed_query, std::map<std::string, T
std::vector<std::vector<TypedValue>> AnalyzeGraphQueryHandler::AnalyzeGraphCreateStatistics(
const std::span<std::string> labels, DbAccessor *execution_db_accessor) {
using LPIndex = std::pair<storage::LabelId, storage::PropertyId>;
auto view = storage::View::OLD;
std::vector<std::vector<TypedValue>> results;
std::map<LPIndex, std::map<storage::PropertyValue, int64_t>> counter;
auto erase_not_specified_label_indices = [&labels, execution_db_accessor](auto &index_info) {
if (labels[0] == kAsterisk) {
return;
}
// Preprocess labels to avoid later checks
std::vector<LPIndex> indices_info = execution_db_accessor->ListAllIndices().label_property;
if (labels[0] != kAsterisk) {
for (auto it = indices_info.cbegin(); it != indices_info.cend();) {
if (std::find(labels.begin(), labels.end(), execution_db_accessor->LabelToName(it->first)) == labels.end()) {
it = indices_info.erase(it);
for (auto it = index_info.cbegin(); it != index_info.cend();) {
if (std::find(labels.begin(), labels.end(), execution_db_accessor->LabelToName(*it)) == labels.end()) {
it = index_info.erase(it);
} else {
++it;
}
}
}
// Iterate over all indexed vertices
std::for_each(indices_info.begin(), indices_info.end(), [execution_db_accessor, &counter](const LPIndex &index_info) {
auto vertices = execution_db_accessor->Vertices(storage::View::OLD, index_info.first, index_info.second);
std::for_each(vertices.begin(), vertices.end(), [&index_info, &counter](const auto &vertex) {
counter[index_info][*vertex.GetProperty(storage::View::OLD, index_info.second)]++;
});
});
};
results.reserve(counter.size());
std::for_each(counter.begin(), counter.end(), [&results, execution_db_accessor](const auto &counter_entry) {
const auto &[label_property, values_map] = counter_entry;
std::vector<TypedValue> result;
result.reserve(kDeleteStatisticsNumResults);
// Extract info
int64_t count_property_value = std::accumulate(
values_map.begin(), values_map.end(), 0,
[](int64_t prev_value, const auto &prop_value_count) { return prev_value + prop_value_count.second; });
// The number of distinct values will never be 0
double avg_group_size = static_cast<double>(count_property_value) / static_cast<double>(values_map.size());
double chi_squared_stat = std::accumulate(
values_map.begin(), values_map.end(), 0.0, [avg_group_size](double prev_result, const auto &value_entry) {
return prev_result + utils::ChiSquaredValue(value_entry.second, avg_group_size);
auto erase_not_specified_label_property_indices = [&labels, execution_db_accessor](auto &index_info) {
if (labels[0] == kAsterisk) {
return;
}
for (auto it = index_info.cbegin(); it != index_info.cend();) {
if (std::find(labels.begin(), labels.end(), execution_db_accessor->LabelToName(it->first)) == labels.end()) {
it = index_info.erase(it);
} else {
++it;
}
}
};
auto populate_label_stats = [execution_db_accessor, view](auto index_info) {
std::vector<std::pair<storage::LabelId, storage::LabelIndexStats>> label_stats;
label_stats.reserve(index_info.size());
std::for_each(index_info.begin(), index_info.end(),
[execution_db_accessor, view, &label_stats](const storage::LabelId &label_id) {
auto vertices = execution_db_accessor->Vertices(view, label_id);
uint64_t no_vertices{0};
uint64_t total_degree{0};
std::for_each(vertices.begin(), vertices.end(),
[&total_degree, &no_vertices, &view](const auto &vertex) {
no_vertices++;
total_degree += *vertex.OutDegree(view) + *vertex.InDegree(view);
});
auto average_degree =
no_vertices > 0 ? static_cast<double>(total_degree) / static_cast<double>(no_vertices) : 0;
auto index_stats = storage::LabelIndexStats{.count = no_vertices, .avg_degree = average_degree};
execution_db_accessor->SetIndexStats(label_id, index_stats);
label_stats.emplace_back(label_id, index_stats);
});
return label_stats;
};
auto populate_label_property_stats = [execution_db_accessor, view](auto &index_info) {
std::map<LPIndex, std::map<storage::PropertyValue, int64_t>> label_property_counter;
std::map<LPIndex, uint64_t> vertex_degree_counter;
// Iterate over all label property indexed vertices
std::for_each(
index_info.begin(), index_info.end(),
[execution_db_accessor, &label_property_counter, &vertex_degree_counter, view](const LPIndex &index_info) {
auto vertices = execution_db_accessor->Vertices(view, index_info.first, index_info.second);
std::for_each(vertices.begin(), vertices.end(),
[&index_info, &label_property_counter, &vertex_degree_counter, &view](const auto &vertex) {
label_property_counter[index_info][*vertex.GetProperty(view, index_info.second)]++;
vertex_degree_counter[index_info] += *vertex.OutDegree(view) + *vertex.InDegree(view);
});
});
execution_db_accessor->SetIndexStats(
label_property.first, label_property.second,
storage::IndexStats{.statistic = chi_squared_stat, .avg_group_size = avg_group_size});
// Save result
result.emplace_back(execution_db_accessor->LabelToName(label_property.first));
result.emplace_back(execution_db_accessor->PropertyToName(label_property.second));
result.emplace_back(count_property_value);
result.emplace_back(static_cast<int64_t>(values_map.size()));
result.emplace_back(avg_group_size);
result.emplace_back(chi_squared_stat);
std::vector<std::pair<LPIndex, storage::LabelPropertyIndexStats>> label_property_stats;
label_property_stats.reserve(label_property_counter.size());
// Turn the per-index value histograms into statistics, store them through the accessor and
// collect them so they can be reported back to the caller.
std::for_each(
    label_property_counter.begin(), label_property_counter.end(),
    [execution_db_accessor, &vertex_degree_counter, &label_property_stats](const auto &counter_entry) {
      const auto &[label_property, values_map] = counter_entry;
      // Total number of counted vertices for this index. The seed is deliberately a uint64_t:
      // std::accumulate keeps its running sum in the type of the initial value, so seeding it with
      // a plain `0` would accumulate in int and narrow the sum on every step.
      const uint64_t count_property_value = std::accumulate(
          values_map.begin(), values_map.end(), uint64_t{0},
          [](uint64_t prev_value, const auto &prop_value_count) { return prev_value + prop_value_count.second; });
      // The number of distinct values is never 0 here: a histogram entry only exists once a value
      // has been counted for it.
      const double avg_group_size =
          static_cast<double>(count_property_value) / static_cast<double>(values_map.size());
      const double chi_squared_stat = std::accumulate(
          values_map.begin(), values_map.end(), 0.0, [avg_group_size](double prev_result, const auto &value_entry) {
            return prev_result + utils::ChiSquaredValue(value_entry.second, avg_group_size);
          });
      // Average (out + in) degree over the counted vertices; 0 when nothing was counted.
      const double average_degree = count_property_value > 0
                                        ? static_cast<double>(vertex_degree_counter[label_property]) /
                                              static_cast<double>(count_property_value)
                                        : 0.0;
      const auto index_stats =
          storage::LabelPropertyIndexStats{.count = count_property_value,
                                           .distinct_values_count = static_cast<uint64_t>(values_map.size()),
                                           .statistic = chi_squared_stat,
                                           .avg_group_size = avg_group_size,
                                           .avg_degree = average_degree};
      execution_db_accessor->SetIndexStats(label_property.first, label_property.second, index_stats);
      label_property_stats.emplace_back(label_property, index_stats);
    });
return label_property_stats;
};
auto index_info = execution_db_accessor->ListAllIndices();
std::vector<storage::LabelId> label_indices_info = index_info.label;
erase_not_specified_label_indices(label_indices_info);
auto label_stats = populate_label_stats(label_indices_info);
std::vector<LPIndex> label_property_indices_info = index_info.label_property;
erase_not_specified_label_property_indices(label_property_indices_info);
auto label_property_stats = populate_label_property_stats(label_property_indices_info);
std::vector<std::vector<TypedValue>> results;
results.reserve(label_stats.size() + label_property_stats.size());
std::for_each(label_stats.begin(), label_stats.end(), [execution_db_accessor, &results](const auto &stat_entry) {
std::vector<TypedValue> result;
result.reserve(kComputeStatisticsNumResults);
result.emplace_back(execution_db_accessor->LabelToName(stat_entry.first));
result.emplace_back(TypedValue());
result.emplace_back(static_cast<int64_t>(stat_entry.second.count));
result.emplace_back(TypedValue());
result.emplace_back(TypedValue());
result.emplace_back(TypedValue());
result.emplace_back(stat_entry.second.avg_degree);
results.push_back(std::move(result));
});
std::for_each(label_property_stats.begin(), label_property_stats.end(),
[execution_db_accessor, &results](const auto &stat_entry) {
std::vector<TypedValue> result;
result.reserve(kComputeStatisticsNumResults);
result.emplace_back(execution_db_accessor->LabelToName(stat_entry.first.first));
result.emplace_back(execution_db_accessor->PropertyToName(stat_entry.first.second));
result.emplace_back(static_cast<int64_t>(stat_entry.second.count));
result.emplace_back(static_cast<int64_t>(stat_entry.second.distinct_values_count));
result.emplace_back(stat_entry.second.avg_group_size);
result.emplace_back(stat_entry.second.statistic);
result.emplace_back(stat_entry.second.avg_degree);
results.push_back(std::move(result));
});
return results;
}
std::vector<std::vector<TypedValue>> AnalyzeGraphQueryHandler::AnalyzeGraphDeleteStatistics(
const std::span<std::string> labels, DbAccessor *execution_db_accessor) {
std::vector<std::pair<storage::LabelId, storage::PropertyId>> loc_results;
std::vector<std::pair<storage::LabelId, storage::PropertyId>> label_prop_results;
std::vector<storage::LabelId> label_results;
if (labels[0] == kAsterisk) {
loc_results = execution_db_accessor->ClearIndexStats();
label_prop_results = execution_db_accessor->ClearLabelPropertyIndexStats();
label_results = execution_db_accessor->ClearLabelIndexStats();
} else {
loc_results = execution_db_accessor->DeleteIndexStatsForLabels(labels);
label_prop_results = execution_db_accessor->DeleteLabelPropertyIndexStats(labels);
label_results = execution_db_accessor->DeleteLabelIndexStats(labels);
}
std::vector<std::vector<TypedValue>> results;
std::transform(loc_results.begin(), loc_results.end(), std::back_inserter(results),
results.reserve(label_prop_results.size() + label_results.size());
std::transform(label_prop_results.begin(), label_prop_results.end(), std::back_inserter(results),
[execution_db_accessor](const auto &label_property_index) {
return std::vector<TypedValue>{
TypedValue(execution_db_accessor->LabelToName(label_property_index.first)),
TypedValue(execution_db_accessor->PropertyToName(label_property_index.second))};
});
std::transform(
label_results.begin(), label_results.end(), std::back_inserter(results),
[execution_db_accessor](const auto &label_index) {
return std::vector<TypedValue>{TypedValue(execution_db_accessor->LabelToName(label_index)), TypedValue("")};
});
return results;
}
@ -1621,7 +1728,8 @@ Callback HandleAnalyzeGraphQuery(AnalyzeGraphQuery *analyze_graph_query, DbAcces
switch (analyze_graph_query->action_) {
case AnalyzeGraphQuery::Action::ANALYZE: {
callback.header = {"label", "property", "num estimation nodes",
"num groups", "avg group size", "chi-squared value"};
"num groups", "avg group size", "chi-squared value",
"avg degree"};
callback.fn = [handler = AnalyzeGraphQueryHandler(), labels = analyze_graph_query->labels_,
execution_db_accessor]() mutable {
return handler.AnalyzeGraphCreateStatistics(labels, execution_db_accessor);

View File

@ -15,9 +15,29 @@
#include "query/parameters.hpp"
#include "query/plan/operator.hpp"
#include "query/typed_value.hpp"
#include "utils/algorithm.hpp"
#include "utils/math.hpp"
namespace memgraph::query::plan {
/**
* The symbol statistics specify essential DB statistics which
* help the query planner (namely here the cost estimator), to decide
* how to do expands and other types of Cypher manipulations.
*/
struct SymbolStatistics {
// Count taken from the index statistics that were saved for the symbol.
uint64_t count;
// Average degree taken from the same index statistics; the estimator uses it as the
// cardinality multiplier when expanding from the symbol.
double degree;
};
/**
* Scope of the statistics for every scanned symbol in
* the operator tree.
*/
struct Scope {
// Saved statistics, keyed by symbol name.
std::unordered_map<std::string, SymbolStatistics> symbol_stats;
};
/**
* Query plan execution time cost estimator, for comparing and choosing optimal
* execution plans.
@ -81,8 +101,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
using HierarchicalLogicalOperatorVisitor::PostVisit;
using HierarchicalLogicalOperatorVisitor::PreVisit;
CostEstimator(TDbAccessor *db_accessor, const Parameters &parameters)
: db_accessor_(db_accessor), parameters(parameters) {}
CostEstimator(TDbAccessor *db_accessor, const SymbolTable &table, const Parameters &parameters)
: db_accessor_(db_accessor), table_(table), parameters(parameters), scopes_{Scope()} {}
CostEstimator(TDbAccessor *db_accessor, const SymbolTable &table, const Parameters &parameters, Scope scope)
: db_accessor_(db_accessor), table_(table), parameters(parameters), scopes_{scope} {}
bool PostVisit(ScanAll &) override {
cardinality_ *= db_accessor_->VerticesCount();
@ -92,6 +115,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
}
bool PostVisit(ScanAllByLabel &scan_all_by_label) override {
auto index_stats = db_accessor_->GetIndexStats(scan_all_by_label.label_);
if (index_stats.has_value()) {
SaveStatsFor(scan_all_by_label.output_symbol_, index_stats.value());
}
cardinality_ *= db_accessor_->VerticesCount(scan_all_by_label.label_);
// ScanAll performs some work for every element that is produced
IncrementCost(CostParam::kScanAllByLabel);
@ -102,6 +130,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
// This cardinality estimation depends on the property value (expression).
// If it's a constant, we can evaluate cardinality exactly, otherwise
// we estimate
auto index_stats = db_accessor_->GetIndexStats(logical_op.label_, logical_op.property_);
if (index_stats.has_value()) {
SaveStatsFor(logical_op.output_symbol_, index_stats.value());
}
auto property_value = ConstPropertyValue(logical_op.expression_);
double factor = 1.0;
if (property_value)
@ -119,6 +152,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
}
bool PostVisit(ScanAllByLabelPropertyRange &logical_op) override {
auto index_stats = db_accessor_->GetIndexStats(logical_op.label_, logical_op.property_);
if (index_stats.has_value()) {
SaveStatsFor(logical_op.output_symbol_, index_stats.value());
}
// this cardinality estimation depends on Bound expressions.
// if they are literals we can evaluate cardinality properly
auto lower = BoundToPropertyValue(logical_op.lower_bound_);
@ -144,6 +182,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
}
bool PostVisit(ScanAllByLabelProperty &logical_op) override {
auto index_stats = db_accessor_->GetIndexStats(logical_op.label_, logical_op.property_);
if (index_stats.has_value()) {
SaveStatsFor(logical_op.output_symbol_, index_stats.value());
}
const auto factor = db_accessor_->VerticesCount(logical_op.label_, logical_op.property_);
cardinality_ *= factor;
IncrementCost(CostParam::MakeScanAllByLabelProperty);
@ -152,6 +195,20 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
// TODO: Cost estimate ScanAllById?
// Estimates an Expand. The cardinality is multiplied by the degree saved for the input symbol
// when statistics are available for it, and by the default CardParam::kExpand otherwise.
bool PostVisit(Expand &expand) override {
  auto expansion_factor = CardParam::kExpand;
  if (const auto input_stats = GetStatsFor(expand.input_symbol_); input_stats) {
    expansion_factor = input_stats->degree;
  }

  cardinality_ *= expansion_factor;
  IncrementCost(CostParam::kExpand);

  return true;
}
// For the given op first increments the cardinality and then cost.
#define POST_VISIT_CARD_FIRST(NAME) \
bool PostVisit(NAME &) override { \
@ -160,7 +217,6 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
return true; \
}
POST_VISIT_CARD_FIRST(Expand);
POST_VISIT_CARD_FIRST(ExpandVariable);
#undef POST_VISIT_CARD_FIRST
@ -225,20 +281,42 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
return false;
}
// Opens a new statistics scope after a Produce. Only the symbols the Produce modifies are carried
// over, each with the statistics it had in the current scope (if any).
bool PostVisit(Produce &op) override {
  Scope produced_scope;

  for (const auto &symbol : op.ModifiedSymbols(table_)) {
    if (const auto stats = GetStatsFor(symbol); stats.has_value()) {
      produced_scope.symbol_stats[symbol.name()] = *stats;
    }
  }

  scopes_.push_back(std::move(produced_scope));
  return true;
}
bool PreVisit(Apply &op) override {
double input_cost = EstimateCostOnBranch(&op.input_);
double subquery_cost = EstimateCostOnBranch(&op.subquery_);
// Get the cost of the main branch
op.input_->Accept(*this);
// if the query is a unit subquery, we don't want the cost to be zero but 1xN
input_cost = input_cost == 0 ? 1 : input_cost;
subquery_cost = subquery_cost == 0 ? 1 : subquery_cost;
// Estimate cost on the subquery branch independently, use a copy
auto &last_scope = scopes_.back();
double subquery_cost = EstimateCostOnBranch(&op.subquery_, last_scope);
subquery_cost = !utils::ApproxEqualDecimal(subquery_cost, 0.0) ? subquery_cost : 1;
cardinality_ *= subquery_cost;
cardinality_ *= input_cost * subquery_cost;
IncrementCost(CostParam::kSubquery);
return false;
}
bool PostVisit(EmptyResult & /*op*/) override {
scopes_.emplace_back();
return true;
}
bool Visit(Once &) override { return true; }
auto cost() const { return cost_; }
@ -255,12 +333,20 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
// accessor used for cardinality estimates in ScanAll and ScanAllByLabel
TDbAccessor *db_accessor_;
const SymbolTable &table_;
const Parameters &parameters;
std::vector<Scope> scopes_;
void IncrementCost(double param) { cost_ += param * cardinality_; }
double EstimateCostOnBranch(std::shared_ptr<LogicalOperator> *branch) {
CostEstimator<TDbAccessor> cost_estimator(db_accessor_, parameters);
CostEstimator<TDbAccessor> cost_estimator(db_accessor_, table_, parameters);
(*branch)->Accept(cost_estimator);
return cost_estimator.cost();
}
double EstimateCostOnBranch(std::shared_ptr<LogicalOperator> *branch, Scope scope) {
CostEstimator<TDbAccessor> cost_estimator(db_accessor_, table_, parameters, scope);
(*branch)->Accept(cost_estimator);
return cost_estimator.cost();
}
@ -287,12 +373,32 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
}
return std::nullopt;
}
// Returns true when the most recently pushed scope holds statistics under this symbol's name.
bool HasStatsFor(const Symbol &symbol) const { return utils::Contains(scopes_.back().symbol_stats, symbol.name()); }
std::optional<SymbolStatistics> GetStatsFor(const Symbol &symbol) {
if (!HasStatsFor(symbol)) {
return std::nullopt;
}
auto &scope = scopes_.back();
return scope.symbol_stats[symbol.name()];
}
// Records index statistics for a symbol in the most recently pushed scope, replacing any entry
// already stored under the same name. T must expose `count` and `avg_degree` members.
template <typename T>
void SaveStatsFor(const Symbol &symbol, T index_stats) {
  auto &current_scope = scopes_.back();
  current_scope.symbol_stats[symbol.name()] =
      SymbolStatistics{.count = index_stats.count, .degree = index_stats.avg_degree};
}
};
/** Returns the estimated cost of the given plan. */
template <class TDbAccessor>
double EstimatePlanCost(TDbAccessor *db, const Parameters &parameters, LogicalOperator &plan) {
CostEstimator<TDbAccessor> estimator(db, parameters);
double EstimatePlanCost(TDbAccessor *db, const SymbolTable &table, const Parameters &parameters,
LogicalOperator &plan) {
CostEstimator<TDbAccessor> estimator(db, table, parameters);
plan.Accept(estimator);
return estimator.cost();
}

View File

@ -47,8 +47,9 @@ class PostProcessor final {
}
template <class TVertexCounts>
double EstimatePlanCost(const std::unique_ptr<LogicalOperator> &plan, TVertexCounts *vertex_counts) {
return query::plan::EstimatePlanCost(vertex_counts, parameters_, *plan);
double EstimatePlanCost(const std::unique_ptr<LogicalOperator> &plan, TVertexCounts *vertex_counts,
const SymbolTable &table) {
return query::plan::EstimatePlanCost(vertex_counts, table, parameters_, *plan);
}
};
@ -97,7 +98,7 @@ auto MakeLogicalPlan(TPlanningContext *context, TPlanPostProcess *post_process,
// Plans are generated lazily and the current plan will disappear, so
// it's ok to move it.
auto rewritten_plan = post_process->Rewrite(std::move(plan), context);
double cost = post_process->EstimatePlanCost(rewritten_plan, &vertex_counts);
double cost = post_process->EstimatePlanCost(rewritten_plan, &vertex_counts, *context->symbol_table);
if (!curr_plan || cost < total_cost) {
curr_plan.emplace(std::move(rewritten_plan));
total_cost = cost;
@ -106,7 +107,7 @@ auto MakeLogicalPlan(TPlanningContext *context, TPlanPostProcess *post_process,
} else {
auto plan = MakeLogicalPlanForSingleQuery<RuleBasedPlanner>(query_parts, context);
auto rewritten_plan = post_process->Rewrite(std::move(plan), context);
total_cost = post_process->EstimatePlanCost(rewritten_plan, &vertex_counts);
total_cost = post_process->EstimatePlanCost(rewritten_plan, &vertex_counts, *context->symbol_table);
curr_plan.emplace(std::move(rewritten_plan));
}

View File

@ -505,7 +505,7 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor {
// FilterInfo with PropertyFilter.
FilterInfo filter;
int64_t vertex_count;
std::optional<storage::IndexStats> index_stats;
std::optional<storage::LabelPropertyIndexStats> index_stats;
};
bool DefaultPreVisit() override { throw utils::NotYetImplemented("optimizing index lookup"); }
@ -572,8 +572,8 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor {
* @param vertex_count: New index's number of vertices.
* @return -1 if the new index is better, 0 if they are equal and 1 if the existing one is better.
*/
auto compare_indices = [](std::optional<LabelPropertyIndex> &found, std::optional<storage::IndexStats> &new_stats,
int vertex_count) {
auto compare_indices = [](std::optional<LabelPropertyIndex> &found,
std::optional<storage::LabelPropertyIndexStats> &new_stats, int vertex_count) {
if (!new_stats.has_value()) {
return 0;
}
@ -610,7 +610,8 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor {
};
int64_t vertex_count = db_->VerticesCount(GetLabel(label), GetProperty(property));
std::optional<storage::IndexStats> new_stats = db_->GetIndexStats(GetLabel(label), GetProperty(property));
std::optional<storage::LabelPropertyIndexStats> new_stats =
db_->GetIndexStats(GetLabel(label), GetProperty(property));
// Conditions, from more to less important:
// the index with 10x less vertices is better.

View File

@ -78,8 +78,12 @@ class VertexCountCache {
return db_->LabelPropertyIndexExists(label, property);
}
std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const {
return db_->GetIndexStats(label);
}
std::optional<storage::LabelPropertyIndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
return db_->GetIndexStats(label, property);
}

View File

@ -478,6 +478,40 @@ void LabelIndex::RunGC() {
}
}
// Stores the statistics for a label, overwriting any statistics already stored for it.
void LabelIndex::SetIndexStats(const storage::LabelId &label, const storage::LabelIndexStats &stats) {
  stats_.insert_or_assign(label, stats);
}
// Returns the statistics stored for a label, or an empty optional when none are stored.
std::optional<LabelIndexStats> LabelIndex::GetIndexStats(const storage::LabelId &label) const {
  const auto it = stats_.find(label);
  if (it == stats_.end()) {
    return std::nullopt;
  }
  return it->second;
}
std::vector<LabelId> LabelIndex::ClearIndexStats() {
std::vector<LabelId> deleted_indexes;
deleted_indexes.reserve(stats_.size());
std::transform(stats_.begin(), stats_.end(), std::back_inserter(deleted_indexes),
[](const auto &elem) { return elem.first; });
stats_.clear();
return deleted_indexes;
}
// Removes the statistics stored for `label`, if any.
// Returns a vector holding the label when statistics were removed, and an empty vector otherwise.
std::vector<LabelId> LabelIndex::DeleteIndexStats(const storage::LabelId &label) {
  std::vector<LabelId> deleted_indexes;
  // stats_ is keyed by label, so at most one entry can match; look it up directly instead of
  // scanning the whole map.
  if (const auto it = stats_.find(label); it != stats_.end()) {
    deleted_indexes.push_back(it->first);
    stats_.erase(it);
  }
  return deleted_indexes;
}
bool LabelPropertyIndex::Entry::operator<(const Entry &rhs) {
if (value < rhs.value) {
return true;
@ -814,8 +848,7 @@ int64_t LabelPropertyIndex::ApproximateVertexCount(LabelId label, PropertyId pro
/*
Iterates over all label-property pairs and deletes the statistics of every pair whose label equals the label parameter.
*/
std::vector<std::pair<LabelId, PropertyId>> LabelPropertyIndex::DeleteIndexStatsForLabel(
const storage::LabelId &label) {
std::vector<std::pair<LabelId, PropertyId>> LabelPropertyIndex::DeleteIndexStats(const storage::LabelId &label) {
std::vector<std::pair<LabelId, PropertyId>> deleted_indexes;
for (auto it = stats_.cbegin(); it != stats_.cend();) {
if (it->first.first == label) {
@ -837,14 +870,14 @@ std::vector<std::pair<LabelId, PropertyId>> LabelPropertyIndex::ClearIndexStats(
return deleted_indexes;
}
void LabelPropertyIndex::SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property,
const IndexStats &stats) {
stats_[{label, property}] = stats;
void LabelPropertyIndex::SetIndexStats(const std::pair<storage::LabelId, storage::PropertyId> &key,
const storage::LabelPropertyIndexStats &stats) {
stats_[key] = stats;
}
std::optional<IndexStats> LabelPropertyIndex::GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
if (auto it = stats_.find({label, property}); it != stats_.end()) {
std::optional<storage::LabelPropertyIndexStats> LabelPropertyIndex::GetIndexStats(
const std::pair<storage::LabelId, storage::PropertyId> &key) const {
if (auto it = stats_.find(key); it != stats_.end()) {
return it->second;
}
return {};

View File

@ -31,6 +31,11 @@ struct Constraints;
using ParalellizedIndexCreationInfo =
std::pair<std::vector<std::pair<Gid, uint64_t>> /*vertex_recovery_info*/, uint64_t /*thread_count*/>;
// Statistics stored per label index.
struct LabelIndexStats {
// Number of vertices counted for the label when the statistics were computed.
uint64_t count;
// Average (out + in) degree of those vertices; 0 when no vertices were counted.
double avg_degree;
};
class LabelIndex {
private:
struct Entry {
@ -124,19 +129,29 @@ class LabelIndex {
return it->second.size();
}
void SetIndexStats(const storage::LabelId &label, const storage::LabelIndexStats &stats);
std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const;
std::vector<LabelId> ClearIndexStats();
std::vector<LabelId> DeleteIndexStats(const storage::LabelId &label);
void Clear() { index_.clear(); }
void RunGC();
private:
std::map<LabelId, utils::SkipList<Entry>> index_;
std::map<LabelId, storage::LabelIndexStats> stats_;
Indices *indices_;
Constraints *constraints_;
Config::Items config_;
};
struct IndexStats {
double statistic, avg_group_size;
// Statistics stored per (label, property) index.
// The member order is part of the interface: the struct is filled with designated initializers.
struct LabelPropertyIndexStats {
  // Number of vertices counted for the index.
  uint64_t count;
  // Number of distinct property values seen among those vertices.
  uint64_t distinct_values_count;
  // Chi-squared value of the property value distribution.
  double statistic;
  // Average number of vertices per distinct property value.
  double avg_group_size;
  // Average (out + in) degree of the counted vertices.
  double avg_degree;
};
class LabelPropertyIndex {
@ -248,13 +263,13 @@ class LabelPropertyIndex {
std::vector<std::pair<LabelId, PropertyId>> ClearIndexStats();
std::vector<std::pair<LabelId, PropertyId>> DeleteIndexStatsForLabel(const storage::LabelId &label);
std::vector<std::pair<LabelId, PropertyId>> DeleteIndexStats(const storage::LabelId &label);
void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property,
const storage::IndexStats &stats);
void SetIndexStats(const std::pair<storage::LabelId, storage::PropertyId> &key,
const storage::LabelPropertyIndexStats &stats);
std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const;
std::optional<storage::LabelPropertyIndexStats> GetIndexStats(
const std::pair<storage::LabelId, storage::PropertyId> &key) const;
void Clear() { index_.clear(); }
@ -262,7 +277,7 @@ class LabelPropertyIndex {
private:
std::map<std::pair<LabelId, PropertyId>, utils::SkipList<Entry>> index_;
std::map<std::pair<LabelId, PropertyId>, storage::IndexStats> stats_;
std::map<std::pair<LabelId, PropertyId>, storage::LabelPropertyIndexStats> stats_;
Indices *indices_;
Constraints *constraints_;
Config::Items config_;

View File

@ -267,28 +267,66 @@ class Storage final {
return storage_->indices_.label_property_index.ApproximateVertexCount(label, property, lower, upper);
}
std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
return storage_->indices_.label_property_index.GetIndexStats(label, property);
/// Generic lookup helper: asks `index` for the statistics stored under `key`.
///
/// @tparam TResult statistics type wrapped in the returned optional
/// @tparam TIndex index type; must provide `GetIndexStats(key)`
/// @tparam TIndexKey key type accepted by `TIndex::GetIndexStats`
/// @return whatever `index.GetIndexStats(key)` yields, converted to `std::optional<TResult>`
template <typename TResult, typename TIndex, typename TIndexKey>
std::optional<TResult> GetIndexStatsForIndex(TIndex &index, TIndexKey &&key) const {
// `key` is passed on as an lvalue (it is not std::forward-ed).
return index.GetIndexStats(key);
}
std::vector<std::pair<LabelId, PropertyId>> ClearIndexStats() {
return storage_->indices_.label_property_index.ClearIndexStats();
/// Looks up the statistics kept by the label index for `label`.
/// @return the result of the label index's `GetIndexStats(label)`
std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const {
  auto &label_index = storage_->indices_.label_index;
  return GetIndexStatsForIndex<storage::LabelIndexStats>(label_index, label);
}
std::vector<std::pair<LabelId, PropertyId>> DeleteIndexStatsForLabels(const std::span<std::string> labels) {
std::vector<std::pair<LabelId, PropertyId>> deleted_indexes;
std::for_each(labels.begin(), labels.end(), [this, &deleted_indexes](const auto &label_str) {
std::vector<std::pair<LabelId, PropertyId>> loc_results =
storage_->indices_.label_property_index.DeleteIndexStatsForLabel(NameToLabel(label_str));
/// Looks up the statistics kept by the label-property index for the
/// (`label`, `property`) pair.
/// @return the result of the label-property index's `GetIndexStats` for that pair
std::optional<storage::LabelPropertyIndexStats> GetIndexStats(const storage::LabelId &label,
                                                              const storage::PropertyId &property) const {
  auto &label_property_index = storage_->indices_.label_property_index;
  return GetIndexStatsForIndex<storage::LabelPropertyIndexStats>(label_property_index,
                                                                 std::make_pair(label, property));
}
/// Generic store helper: hands `stats` to `index` under `key`.
///
/// @tparam TIndex index type; must provide `SetIndexStats(key, stats)`
/// @tparam TIndexKey key type accepted by `TIndex::SetIndexStats`
/// @tparam TIndexStats statistics type accepted by `TIndex::SetIndexStats`
///
/// NOTE(review): the method is `const` although it calls a setter on `index`;
/// the index is reached through a reference argument, not through `this`.
template <typename TIndex, typename TIndexKey, typename TIndexStats>
void SetIndexStatsForIndex(TIndex &index, TIndexKey &&key, TIndexStats &stats) const {
index.SetIndexStats(key, stats);
}
/// Stores `stats` in the label index under `label`.
void SetIndexStats(const storage::LabelId &label, const LabelIndexStats &stats) {
  auto &label_index = storage_->indices_.label_index;
  SetIndexStatsForIndex(label_index, label, stats);
}
/// Stores `stats` in the label-property index under the (`label`, `property`) pair.
void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property,
                   const LabelPropertyIndexStats &stats) {
  auto &label_property_index = storage_->indices_.label_property_index;
  SetIndexStatsForIndex(label_property_index, std::make_pair(label, property), stats);
}
/// Generic clear helper: calls `ClearIndexStats()` on `index`.
///
/// @tparam TResult element type of the returned vector
/// @tparam TIndex index type; must provide `ClearIndexStats()`
/// @return whatever `index.ClearIndexStats()` returns
template <typename TResult, typename TIndex>
std::vector<TResult> ClearIndexStatsForIndex(TIndex &index) const {
return index.ClearIndexStats();
}
std::vector<std::pair<LabelId, PropertyId>> ClearLabelPropertyIndexStats() {
return ClearIndexStatsForIndex<std::pair<LabelId, PropertyId>>(storage_->indices_.label_property_index);
}
/// Clears the statistics of the label index.
/// @return the labels returned by the index's `ClearIndexStats()`
std::vector<LabelId> ClearLabelIndexStats() {
  auto &label_index = storage_->indices_.label_index;
  return ClearIndexStatsForIndex<LabelId>(label_index);
}
template <typename TResult, typename TIndex>
std::vector<TResult> DeleteIndexStatsForIndex(TIndex &index, const std::span<std::string> labels) {
std::vector<TResult> deleted_indexes;
for (const auto &label : labels) {
std::vector<TResult> loc_results = index.DeleteIndexStats(NameToLabel(label));
deleted_indexes.insert(deleted_indexes.end(), std::make_move_iterator(loc_results.begin()),
std::make_move_iterator(loc_results.end()));
});
}
return deleted_indexes;
}
void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property, const IndexStats &stats) {
storage_->indices_.label_property_index.SetIndexStats(label, property, stats);
std::vector<std::pair<LabelId, PropertyId>> DeleteLabelPropertyIndexStats(const std::span<std::string> labels) {
return DeleteIndexStatsForIndex<std::pair<LabelId, PropertyId>>(storage_->indices_.label_property_index, labels);
}
/// Deletes label index statistics for every label name in `labels`.
/// @return the labels collected by `DeleteIndexStatsForIndex`
std::vector<LabelId> DeleteLabelIndexStats(const std::span<std::string> labels) {
  auto &label_index = storage_->indices_.label_index;
  return DeleteIndexStatsForIndex<LabelId>(label_index, labels);
}
/// @return Accessor to the deleted vertex if a deletion took place, std::nullopt otherwise

View File

@ -131,7 +131,7 @@ static void BM_PlanAndEstimateIndexedMatching(benchmark::State &state) {
auto plans = memgraph::query::plan::MakeLogicalPlanForSingleQuery<memgraph::query::plan::VariableStartPlanner>(
query_parts, &ctx);
for (auto plan : plans) {
memgraph::query::plan::EstimatePlanCost(&dba, parameters, *plan);
memgraph::query::plan::EstimatePlanCost(&dba, symbol_table, parameters, *plan);
}
}
}
@ -161,7 +161,7 @@ static void BM_PlanAndEstimateIndexedMatchingWithCachedCounts(benchmark::State &
auto plans = memgraph::query::plan::MakeLogicalPlanForSingleQuery<memgraph::query::plan::VariableStartPlanner>(
query_parts, &ctx);
for (auto plan : plans) {
memgraph::query::plan::EstimatePlanCost(&vertex_counts, parameters, *plan);
memgraph::query::plan::EstimatePlanCost(&vertex_counts, symbol_table, parameters, *plan);
}
}
}

View File

@ -13,6 +13,7 @@ import typing
import mgclient
import pytest
from gqlalchemy import Memgraph
def execute_and_fetch_all(cursor: mgclient.Cursor, query: str, params: dict = {}) -> typing.List[tuple]:
@ -27,3 +28,14 @@ def connect(**kwargs) -> mgclient.Connection:
yield connection
cursor = connection.cursor()
execute_and_fetch_all(cursor, "MATCH (n) DETACH DELETE n")
@pytest.fixture
def memgraph(**kwargs) -> Memgraph:
    """Yield a ``Memgraph`` client and clean the instance up after the test.

    Teardown drops the database, deletes the statistics produced by
    ``ANALYZE GRAPH`` and drops all indexes, in that order.
    """
    db = Memgraph()
    yield db

    # Teardown: runs once the test using the fixture has finished.
    db.drop_database()
    db.execute("analyze graph delete statistics;")
    db.drop_indexes()

View File

@ -12,7 +12,10 @@
import sys
import pytest
from common import connect, execute_and_fetch_all
from common import connect, execute_and_fetch_all, memgraph
QUERY_PLAN = "QUERY PLAN"
# E2E tests for checking query semantic
# ------------------------------------
@ -96,8 +99,8 @@ def test_analyze_full_graph(analyze_query, connect):
else:
first_index = 1
# Check results
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 5, 10, 0)
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 5, 10, 0, 0)
# After analyzing graph, id1 index should be chosen because it has smaller average group size
expected_explain_after_analysis = [
(f" * Produce {{n}}",),
@ -131,8 +134,8 @@ def test_cardinality_different_avg_group_size_uniform_dist(connect):
else:
first_index = 1
# Check results
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 20, 5, 0)
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 20, 5, 0, 0)
expected_explain_after_analysis = [
(f" * Produce {{n}}",),
(f" * Filter",),
@ -161,8 +164,8 @@ def test_cardinality_same_avg_group_size_uniform_dist_diff_vertex_count(connect)
else:
first_index = 1
# Check results
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 50, 1, 0)
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 50, 1, 0, 0)
expected_explain_after_analysis = [
(f" * Produce {{n}}",),
(f" * Filter",),
@ -191,8 +194,8 @@ def test_large_diff_in_num_vertices_v1(connect):
else:
first_index = 1
# Check results
assert analyze_graph_results[first_index] == ("Label", "id1", 1000, 1000, 1, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 99, 1, 99, 0)
assert analyze_graph_results[first_index] == ("Label", "id1", 1000, 1000, 1, 0, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 99, 1, 99, 0, 0)
expected_explain_after_analysis = [
(f" * Produce {{n}}",),
(f" * Filter",),
@ -221,8 +224,8 @@ def test_large_diff_in_num_vertices_v2(connect):
else:
first_index = 1
# Check results
assert analyze_graph_results[first_index] == ("Label", "id1", 99, 1, 99, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 1000, 1000, 1, 0)
assert analyze_graph_results[first_index] == ("Label", "id1", 99, 1, 99, 0, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 1000, 1000, 1, 0, 0)
expected_explain_after_analysis = [
(f" * Produce {{n}}",),
(f" * Filter",),
@ -261,8 +264,8 @@ def test_same_avg_group_size_diff_distribution(connect):
else:
first_index = 1
# Check results
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 5, 20, 32.5)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 5, 20, 0)
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 5, 20, 32.5, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 5, 20, 0, 0)
expected_explain_after_analysis = [
(f" * Produce {{n}}",),
(f" * Filter",),
@ -278,5 +281,194 @@ def test_same_avg_group_size_diff_distribution(connect):
execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);")
def test_given_supernode_when_expanding_then_expand_other_way_around(memgraph):
    """Check that ANALYZE GRAPH flips the expansion direction around a supernode.

    Before the analysis the expected plan expands from ``s`` to ``n``; after
    ``analyze graph;`` the same EXPLAIN is expected to expand from ``n`` to ``s``.
    """
    setup_statements = (
        "FOREACH (i in range(1, 1000) | CREATE (:Node {id: i}));",
        "CREATE (:SuperNode {id: 1});",
        "CREATE INDEX ON :SuperNode(id);",
        "CREATE INDEX ON :SuperNode;",
        "CREATE INDEX ON :Node(id);",
        "CREATE INDEX ON :Node;",
        "match (n:Node) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);",
    )
    for statement in setup_statements:
        memgraph.execute(statement)

    query = "explain match (n:Node) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);"

    def fetch_plan():
        # One plan line per returned row, read from the QUERY PLAN column.
        return [row[QUERY_PLAN] for row in memgraph.execute_and_fetch(query)]

    expected_explain = [
        f" * EmptyResult",
        f" * Merge",
        f" |\\ On Match",
        f" | * Expand (s)-[anon3:HAS_REL_TO]->(n)",
        f" | * Once",
        f" |\\ On Create",
        f" | * CreateExpand (n)<-[anon3:HAS_REL_TO]-(s)",
        f" | * Once",
        f" * ScanAllByLabel (n :Node)",
        f" * ScanAllByLabelPropertyValue (s :SuperNode {{id}})",
        f" * Once",
    ]

    plan_before_analysis = fetch_plan()
    assert expected_explain == plan_before_analysis

    memgraph.execute("analyze graph;")

    # Only the Expand line is expected to change once statistics exist.
    expand_from_supernode = f" | * Expand (s)-[anon3:HAS_REL_TO]->(n)"
    expand_from_node = f" | * Expand (n)<-[anon3:HAS_REL_TO]-(s)"
    expected_explain = [line.replace(expand_from_supernode, expand_from_node) for line in expected_explain]

    plan_after_analysis = fetch_plan()
    assert expected_explain == plan_after_analysis
def test_given_supernode_when_subquery_then_carry_information_to_subquery(memgraph):
    """Check that the supernode-aware expansion also applies inside a CALL subquery.

    The Expand inside the subquery's Merge is expected to go from ``s`` to ``n``
    before ``analyze graph;`` and from ``n`` to ``s`` afterwards.
    """
    setup_statements = (
        "FOREACH (i in range(1, 1000) | CREATE (:Node {id: i}));",
        "FOREACH (i in range(1, 1000) | CREATE (:Node2 {id: i}));",
        "CREATE (:SuperNode {id: 1});",
        "CREATE INDEX ON :SuperNode(id);",
        "CREATE INDEX ON :SuperNode;",
        "CREATE INDEX ON :Node(id);",
        "CREATE INDEX ON :Node;",
        "CREATE INDEX ON :Node2(id);",
        "CREATE INDEX ON :Node2;",
        "match (n:Node) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);",
        "match (n:Node2) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);",
    )
    for statement in setup_statements:
        memgraph.execute(statement)

    query = "explain match (n:Node) match (s:SuperNode {id: 1}) call { with n, s merge (n)<-[:HAS_REL_TO]-(s) } return 1"

    def fetch_plan():
        # One plan line per returned row, read from the QUERY PLAN column.
        return [row[QUERY_PLAN] for row in memgraph.execute_and_fetch(query)]

    expected_explain = [
        f" * Produce {{0}}",
        f" * Accumulate",
        f" * Accumulate",
        f" * Apply",
        f" |\\ ",
        f" | * EmptyResult",
        f" | * Merge",
        f" | |\\ On Match",
        f" | | * Expand (s)-[anon3:HAS_REL_TO]->(n)",
        f" | | * Once",
        f" | |\\ On Create",
        f" | | * CreateExpand (n)<-[anon3:HAS_REL_TO]-(s)",
        f" | | * Once",
        f" | * Produce {{n, s}}",
        f" | * Once",
        f" * ScanAllByLabel (n :Node)",
        f" * ScanAllByLabelPropertyValue (s :SuperNode {{id}})",
        f" * Once",
    ]

    plan_before_analysis = fetch_plan()
    assert expected_explain == plan_before_analysis

    memgraph.execute("analyze graph;")

    # Only the Expand line inside the subquery is expected to change.
    expand_from_supernode = f" | | * Expand (s)-[anon3:HAS_REL_TO]->(n)"
    expand_from_node = f" | | * Expand (n)<-[anon3:HAS_REL_TO]-(s)"
    expected_explain = [line.replace(expand_from_supernode, expand_from_node) for line in expected_explain]

    plan_after_analysis = fetch_plan()
    assert expected_explain == plan_after_analysis
def test_given_supernode_when_subquery_and_union_then_carry_information(memgraph):
    """Check the supernode-aware expansion in both branches of a UNION ALL.

    Each branch contains a CALL subquery with a Merge; the Expand lines of both
    branches (``anon3`` and ``anon7``) are expected to flip direction once
    ``analyze graph;`` has been run.
    """
    setup_statements = (
        "FOREACH (i in range(1, 1000) | CREATE (:Node {id: i}));",
        "FOREACH (i in range(1, 1000) | CREATE (:Node2 {id: i}));",
        "CREATE (:SuperNode {id: 1});",
        "CREATE INDEX ON :SuperNode(id);",
        "CREATE INDEX ON :SuperNode;",
        "CREATE INDEX ON :Node(id);",
        "CREATE INDEX ON :Node;",
        "CREATE INDEX ON :Node2(id);",
        "CREATE INDEX ON :Node2;",
        "match (n:Node) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);",
        "match (n:Node2) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);",
    )
    for statement in setup_statements:
        memgraph.execute(statement)

    query = "explain match (n:Node) match (s:SuperNode {id: 1}) call { with n, s merge (n)<-[:HAS_REL_TO]-(s) } return s union all match (n:Node) match (s:SuperNode {id: 1}) call { with n, s merge (n)<-[:HAS_REL_TO]-(s) } return s;"

    def fetch_plan():
        # One plan line per returned row, read from the QUERY PLAN column.
        return [row[QUERY_PLAN] for row in memgraph.execute_and_fetch(query)]

    expected_explain = [
        f" * Union {{s : s}}",
        f" |\\ ",
        f" | * Produce {{s}}",
        f" | * Accumulate",
        f" | * Accumulate",
        f" | * Apply",
        f" | |\\ ",
        f" | | * EmptyResult",
        f" | | * Merge",
        f" | | |\\ On Match",
        f" | | | * Expand (s)-[anon7:HAS_REL_TO]->(n)",
        f" | | | * Once",
        f" | | |\\ On Create",
        f" | | | * CreateExpand (n)<-[anon7:HAS_REL_TO]-(s)",
        f" | | | * Once",
        f" | | * Produce {{n, s}}",
        f" | | * Once",
        f" | * ScanAllByLabel (n :Node)",
        f" | * ScanAllByLabelPropertyValue (s :SuperNode {{id}})",
        f" | * Once",
        f" * Produce {{s}}",
        f" * Accumulate",
        f" * Accumulate",
        f" * Apply",
        f" |\\ ",
        f" | * EmptyResult",
        f" | * Merge",
        f" | |\\ On Match",
        f" | | * Expand (s)-[anon3:HAS_REL_TO]->(n)",
        f" | | * Once",
        f" | |\\ On Create",
        f" | | * CreateExpand (n)<-[anon3:HAS_REL_TO]-(s)",
        f" | | * Once",
        f" | * Produce {{n, s}}",
        f" | * Once",
        f" * ScanAllByLabel (n :Node)",
        f" * ScanAllByLabelPropertyValue (s :SuperNode {{id}})",
        f" * Once",
    ]

    plan_before_analysis = fetch_plan()
    assert expected_explain == plan_before_analysis

    memgraph.execute("analyze graph;")

    # Flip the Expand line of each UNION branch; the anon3 rewrite is applied
    # first and the anon7 rewrite second, matching the order of the two passes.
    expected_explain = [
        line.replace(
            f" | | * Expand (s)-[anon3:HAS_REL_TO]->(n)", f" | | * Expand (n)<-[anon3:HAS_REL_TO]-(s)"
        ).replace(
            f" | | | * Expand (s)-[anon7:HAS_REL_TO]->(n)", f" | | | * Expand (n)<-[anon7:HAS_REL_TO]-(s)"
        )
        for line in expected_explain
    ]

    plan_after_analysis = fetch_plan()
    assert expected_explain == plan_after_analysis
def test_given_empty_graph_when_analyzing_graph_return_zero_degree(memgraph):
memgraph.execute("CREATE INDEX ON :Node;")
label_stats = next(memgraph.execute_and_fetch("analyze graph;"))
expected_analysis = {
"label": "Node",
"property": None,
"num estimation nodes": 0,
"num groups": None,
"avg group size": None,
"chi-squared value": None,
"avg degree": 0.0,
}
assert set(label_stats) == set(expected_analysis)
if __name__ == "__main__":
sys.exit(pytest.main([__file__, "-rA"]))

View File

@ -27,6 +27,7 @@
#include "query/plan/planner.hpp"
#include "query/plan/pretty_print.hpp"
#include "query/typed_value.hpp"
#include "storage/v2/indices.hpp"
#include "storage/v2/property_value.hpp"
#include "utils/string.hpp"
@ -213,8 +214,12 @@ class InteractiveDbAccessor {
return label_property_index_.at(key);
}
std::optional<memgraph::storage::IndexStats> GetIndexStats(memgraph::storage::LabelId label,
memgraph::storage::PropertyId property) const {
/// Forwards the label index statistics lookup to the wrapped accessor.
std::optional<memgraph::storage::LabelIndexStats> GetIndexStats(const memgraph::storage::LabelId label) const {
  auto label_stats = dba_->GetIndexStats(label);
  return label_stats;
}
/// Forwards the label-property index statistics lookup to the wrapped accessor.
std::optional<memgraph::storage::LabelPropertyIndexStats> GetIndexStats(
    const memgraph::storage::LabelId label, const memgraph::storage::PropertyId property) const {
  auto label_property_stats = dba_->GetIndexStats(label, property);
  return label_property_stats;
}
@ -458,7 +463,7 @@ auto MakeLogicalPlans(memgraph::query::CypherQuery *query, memgraph::query::AstS
memgraph::query::AstStorage ast_copy;
auto unoptimized_plan = plan->Clone(&ast_copy);
auto rewritten_plan = post_process.Rewrite(std::move(plan), &ctx);
double cost = post_process.EstimatePlanCost(rewritten_plan, dba);
double cost = post_process.EstimatePlanCost(rewritten_plan, dba, symbol_table);
interactive_plans.push_back(
InteractivePlan{std::move(unoptimized_plan), std::move(ast_copy), std::move(rewritten_plan), cost});
}

View File

@ -74,7 +74,7 @@ class QueryCostEstimator : public ::testing::Test {
}
auto Cost() {
CostEstimator<memgraph::query::DbAccessor> cost_estimator(&*dba, parameters_);
CostEstimator<memgraph::query::DbAccessor> cost_estimator(&*dba, symbol_table_, parameters_);
last_op_->Accept(cost_estimator);
return cost_estimator.cost();
}
@ -201,7 +201,7 @@ TEST_F(QueryCostEstimator, SubqueryCartesian) {
std::shared_ptr<LogicalOperator> input = std::make_shared<ScanAll>(std::make_shared<Once>(), NextSymbol());
std::shared_ptr<LogicalOperator> subquery = std::make_shared<ScanAll>(std::make_shared<Once>(), NextSymbol());
MakeOp<memgraph::query::plan::Apply>(input, subquery, true);
EXPECT_COST(CostParam::kSubquery * no_vertices * no_vertices);
EXPECT_COST(CostParam::kSubquery * no_vertices * no_vertices + no_vertices);
}
TEST_F(QueryCostEstimator, UnitSubquery) {

View File

@ -500,9 +500,13 @@ class FakeDbAccessor {
return false;
}
memgraph::storage::IndexStats GetIndexStats(memgraph::storage::LabelId label,
memgraph::storage::PropertyId property) const {
return memgraph::storage::IndexStats{.statistic = 0, .avg_group_size = 1}; // unique id
/// Fake statistics for any (label, property) index; the arguments are ignored.
std::optional<memgraph::storage::LabelPropertyIndexStats> GetIndexStats(
    const memgraph::storage::LabelId label, const memgraph::storage::PropertyId property) const {
  // avg_group_size of 1 models a unique id.
  memgraph::storage::LabelPropertyIndexStats fake_stats{.statistic = 0, .avg_group_size = 1};
  return fake_stats;
}
/// Fake statistics for any label index; the argument is ignored.
std::optional<memgraph::storage::LabelIndexStats> GetIndexStats(const memgraph::storage::LabelId label) const {
  // Zero vertices and zero average degree.
  memgraph::storage::LabelIndexStats fake_stats{.count = 0, .avg_degree = 0};
  return fake_stats;
}
void SetIndexCount(memgraph::storage::LabelId label, int64_t count) { label_index_[label] = count; }

View File

@ -1252,4 +1252,11 @@ TEST_F(TestSymbolGenerator, Subqueries) {
query = QUERY(SINGLE_QUERY(MATCH(PATTERN(NODE("n"))), CALL_SUBQUERY(subquery), RETURN("n", "m")));
symbol_table = MakeSymbolTable(query);
ASSERT_EQ(symbol_table.max_position(), 11);
// MATCH (n) CALL { MATCH (s) RETURN s } RETURN n UNION MATCH (n) CALL { MATCH (s) RETURN s } RETURN n
subquery = QUERY(SINGLE_QUERY(MATCH(PATTERN(NODE("s"))), RETURN("s")));
query = QUERY(SINGLE_QUERY(MATCH(PATTERN(NODE("n"))), CALL_SUBQUERY(subquery), RETURN("n")),
UNION(SINGLE_QUERY(MATCH(PATTERN(NODE("n"))), CALL_SUBQUERY(subquery), RETURN("n"))));
symbol_table = MakeSymbolTable(query);
ASSERT_EQ(symbol_table.max_position(), 13);
}