Add vertex degree to index statistics (#1026)
Add graph analysis of vertex degrees when doing ANALYZE GRAPH.
parent 261aa4f49b
commit 84721f7e0a
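A quick way to see the new column in practice: the hedged sketch below (not part of this commit) runs ANALYZE GRAPH through mgclient, the client library the e2e tests in this diff already use, and prints each result row; the host/port and the :Node index are assumptions for illustration only.

import mgclient

# Assumes a local Memgraph listening on the default Bolt port.
connection = mgclient.connect(host="127.0.0.1", port=7687)
connection.autocommit = True
cursor = connection.cursor()

# Statistics are only collected for indexed labels / label-property pairs.
cursor.execute("CREATE INDEX ON :Node;")
cursor.execute("ANALYZE GRAPH;")

# Each row now has seven fields: label, property, num estimation nodes,
# num groups, avg group size, chi-squared value, and the new avg degree.
for row in cursor.fetchall():
    print(row)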
@@ -15,5 +15,5 @@
namespace memgraph::query {
inline const std::string kAsterisk = "*";
inline constexpr uint16_t kDeleteStatisticsNumResults = 6;
inline constexpr uint16_t kComputeStatisticsNumResults = 7;
} // namespace memgraph::query

@@ -430,22 +430,36 @@ class DbAccessor final {
return accessor_->LabelPropertyIndexExists(label, prop);
}

std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const {
return accessor_->GetIndexStats(label);
}

std::optional<storage::LabelPropertyIndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
return accessor_->GetIndexStats(label, property);
}

std::vector<std::pair<storage::LabelId, storage::PropertyId>> ClearIndexStats() {
return accessor_->ClearIndexStats();
std::vector<std::pair<storage::LabelId, storage::PropertyId>> ClearLabelPropertyIndexStats() {
return accessor_->ClearLabelPropertyIndexStats();
}

std::vector<std::pair<storage::LabelId, storage::PropertyId>> DeleteIndexStatsForLabels(
std::vector<storage::LabelId> ClearLabelIndexStats() { return accessor_->ClearLabelIndexStats(); }

std::vector<std::pair<storage::LabelId, storage::PropertyId>> DeleteLabelPropertyIndexStats(
const std::span<std::string> labels) {
return accessor_->DeleteIndexStatsForLabels(labels);
return accessor_->DeleteLabelPropertyIndexStats(labels);
}

std::vector<storage::LabelId> DeleteLabelIndexStats(const std::span<std::string> labels) {
return accessor_->DeleteLabelIndexStats(labels);
}

void SetIndexStats(const storage::LabelId &label, const storage::LabelIndexStats &stats) {
accessor_->SetIndexStats(label, stats);
}

void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property,
const storage::IndexStats &stats) {
const storage::LabelPropertyIndexStats &stats) {
accessor_->SetIndexStats(label, property, stats);
}
@@ -157,25 +157,34 @@ void SymbolGenerator::VisitReturnBody(ReturnBody &body, Where *where) {
// Query

bool SymbolGenerator::PreVisit(SingleQuery &) {
prev_return_names_ = curr_return_names_;
curr_return_names_.clear();
auto &scope = scopes_.back();

scope.prev_return_names = scope.curr_return_names;
scope.curr_return_names.clear();
return true;
}

// Union

bool SymbolGenerator::PreVisit(CypherUnion &) {
scopes_.back() = Scope();
auto next_scope = Scope();
next_scope.curr_return_names = scopes_.back().curr_return_names;

scopes_.pop_back();
scopes_.push_back(next_scope);

return true;
}

bool SymbolGenerator::PostVisit(CypherUnion &cypher_union) {
if (prev_return_names_ != curr_return_names_) {
auto &scope = scopes_.back();

if (scope.prev_return_names != scope.curr_return_names) {
throw SemanticException("All subqueries in an UNION must have the same column names.");
}

// create new symbols for the result of the union
for (const auto &name : curr_return_names_) {
for (const auto &name : scope.curr_return_names) {
auto symbol = CreateSymbol(name, false);
cypher_union.union_symbols_.push_back(symbol);
}

@@ -259,7 +268,9 @@ bool SymbolGenerator::PreVisit(Return &ret) {
}

bool SymbolGenerator::PostVisit(Return &) {
for (const auto &name_symbol : scopes_.back().symbols) curr_return_names_.insert(name_symbol.first);
auto &scope = scopes_.back();

for (const auto &name_symbol : scope.symbols) scope.curr_return_names.insert(name_symbol.first);
return true;
}

@@ -140,6 +140,8 @@ class SymbolGenerator : public HierarchicalTreeVisitor {
std::vector<Identifier *> identifiers_in_match;
// Number of nested IfOperators.
int num_if_operators{0};
std::unordered_set<std::string> prev_return_names{};
std::unordered_set<std::string> curr_return_names{};
};

static std::optional<Symbol> FindSymbolInScope(const std::string &name, const Scope &scope, Symbol::Type type);

@@ -171,8 +173,6 @@ class SymbolGenerator : public HierarchicalTreeVisitor {
// is mapped by its name.
std::unordered_map<std::string, Identifier *> predefined_identifiers_;
std::vector<Scope> scopes_;
std::unordered_set<std::string> prev_return_names_;
std::unordered_set<std::string> curr_return_names_;
};

inline SymbolTable MakeSymbolTable(CypherQuery *query, const std::vector<Identifier *> &predefined_identifiers = {}) {
@@ -1545,74 +1545,181 @@ PreparedQuery PrepareDumpQuery(ParsedQuery parsed_query, std::map<std::string, T
std::vector<std::vector<TypedValue>> AnalyzeGraphQueryHandler::AnalyzeGraphCreateStatistics(
const std::span<std::string> labels, DbAccessor *execution_db_accessor) {
using LPIndex = std::pair<storage::LabelId, storage::PropertyId>;
auto view = storage::View::OLD;

std::vector<std::vector<TypedValue>> results;
std::map<LPIndex, std::map<storage::PropertyValue, int64_t>> counter;
auto erase_not_specified_label_indices = [&labels, execution_db_accessor](auto &index_info) {
if (labels[0] == kAsterisk) {
return;
}

// Preprocess labels to avoid later checks
std::vector<LPIndex> indices_info = execution_db_accessor->ListAllIndices().label_property;
if (labels[0] != kAsterisk) {
for (auto it = indices_info.cbegin(); it != indices_info.cend();) {
if (std::find(labels.begin(), labels.end(), execution_db_accessor->LabelToName(it->first)) == labels.end()) {
it = indices_info.erase(it);
for (auto it = index_info.cbegin(); it != index_info.cend();) {
if (std::find(labels.begin(), labels.end(), execution_db_accessor->LabelToName(*it)) == labels.end()) {
it = index_info.erase(it);
} else {
++it;
}
}
}
// Iterate over all indexed vertices
std::for_each(indices_info.begin(), indices_info.end(), [execution_db_accessor, &counter](const LPIndex &index_info) {
auto vertices = execution_db_accessor->Vertices(storage::View::OLD, index_info.first, index_info.second);
std::for_each(vertices.begin(), vertices.end(), [&index_info, &counter](const auto &vertex) {
counter[index_info][*vertex.GetProperty(storage::View::OLD, index_info.second)]++;
});
});
};

results.reserve(counter.size());
std::for_each(counter.begin(), counter.end(), [&results, execution_db_accessor](const auto &counter_entry) {
const auto &[label_property, values_map] = counter_entry;
std::vector<TypedValue> result;
result.reserve(kDeleteStatisticsNumResults);
// Extract info
int64_t count_property_value = std::accumulate(
values_map.begin(), values_map.end(), 0,
[](int64_t prev_value, const auto &prop_value_count) { return prev_value + prop_value_count.second; });
// num_distinc_values will never be 0
double avg_group_size = static_cast<double>(count_property_value) / static_cast<double>(values_map.size());
double chi_squared_stat = std::accumulate(
values_map.begin(), values_map.end(), 0.0, [avg_group_size](double prev_result, const auto &value_entry) {
return prev_result + utils::ChiSquaredValue(value_entry.second, avg_group_size);
auto erase_not_specified_label_property_indices = [&labels, execution_db_accessor](auto &index_info) {
if (labels[0] == kAsterisk) {
return;
}

for (auto it = index_info.cbegin(); it != index_info.cend();) {
if (std::find(labels.begin(), labels.end(), execution_db_accessor->LabelToName(it->first)) == labels.end()) {
it = index_info.erase(it);
} else {
++it;
}
}
};

auto populate_label_stats = [execution_db_accessor, view](auto index_info) {
std::vector<std::pair<storage::LabelId, storage::LabelIndexStats>> label_stats;
label_stats.reserve(index_info.size());
std::for_each(index_info.begin(), index_info.end(),
[execution_db_accessor, view, &label_stats](const storage::LabelId &label_id) {
auto vertices = execution_db_accessor->Vertices(view, label_id);
uint64_t no_vertices{0};
uint64_t total_degree{0};
std::for_each(vertices.begin(), vertices.end(),
[&total_degree, &no_vertices, &view](const auto &vertex) {
no_vertices++;
total_degree += *vertex.OutDegree(view) + *vertex.InDegree(view);
});

auto average_degree =
no_vertices > 0 ? static_cast<double>(total_degree) / static_cast<double>(no_vertices) : 0;
auto index_stats = storage::LabelIndexStats{.count = no_vertices, .avg_degree = average_degree};
execution_db_accessor->SetIndexStats(label_id, index_stats);
label_stats.emplace_back(label_id, index_stats);
});

return label_stats;
};

auto populate_label_property_stats = [execution_db_accessor, view](auto &index_info) {
std::map<LPIndex, std::map<storage::PropertyValue, int64_t>> label_property_counter;
std::map<LPIndex, uint64_t> vertex_degree_counter;
// Iterate over all label property indexed vertices
std::for_each(
index_info.begin(), index_info.end(),
[execution_db_accessor, &label_property_counter, &vertex_degree_counter, view](const LPIndex &index_info) {
auto vertices = execution_db_accessor->Vertices(view, index_info.first, index_info.second);
std::for_each(vertices.begin(), vertices.end(),
[&index_info, &label_property_counter, &vertex_degree_counter, &view](const auto &vertex) {
label_property_counter[index_info][*vertex.GetProperty(view, index_info.second)]++;
vertex_degree_counter[index_info] += *vertex.OutDegree(view) + *vertex.InDegree(view);
});
});
execution_db_accessor->SetIndexStats(
label_property.first, label_property.second,
storage::IndexStats{.statistic = chi_squared_stat, .avg_group_size = avg_group_size});
// Save result
result.emplace_back(execution_db_accessor->LabelToName(label_property.first));
result.emplace_back(execution_db_accessor->PropertyToName(label_property.second));
result.emplace_back(count_property_value);
result.emplace_back(static_cast<int64_t>(values_map.size()));
result.emplace_back(avg_group_size);
result.emplace_back(chi_squared_stat);

std::vector<std::pair<LPIndex, storage::LabelPropertyIndexStats>> label_property_stats;
label_property_stats.reserve(label_property_counter.size());
std::for_each(
label_property_counter.begin(), label_property_counter.end(),
[execution_db_accessor, &vertex_degree_counter, &label_property_stats](const auto &counter_entry) {
const auto &[label_property, values_map] = counter_entry;
// Extract info
uint64_t count_property_value = std::accumulate(
values_map.begin(), values_map.end(), 0,
[](uint64_t prev_value, const auto &prop_value_count) { return prev_value + prop_value_count.second; });
// num_distinc_values will never be 0
double avg_group_size = static_cast<double>(count_property_value) / static_cast<double>(values_map.size());
double chi_squared_stat = std::accumulate(
values_map.begin(), values_map.end(), 0.0, [avg_group_size](double prev_result, const auto &value_entry) {
return prev_result + utils::ChiSquaredValue(value_entry.second, avg_group_size);
});

double average_degree = count_property_value > 0
? static_cast<double>(vertex_degree_counter[label_property]) /
static_cast<double>(count_property_value)
: 0;

auto index_stats =
storage::LabelPropertyIndexStats{.count = count_property_value,
.distinct_values_count = static_cast<uint64_t>(values_map.size()),
.statistic = chi_squared_stat,
.avg_group_size = avg_group_size,
.avg_degree = average_degree};
execution_db_accessor->SetIndexStats(label_property.first, label_property.second, index_stats);
label_property_stats.push_back(std::make_pair(label_property, index_stats));
});

return label_property_stats;
};

auto index_info = execution_db_accessor->ListAllIndices();

std::vector<storage::LabelId> label_indices_info = index_info.label;
erase_not_specified_label_indices(label_indices_info);
auto label_stats = populate_label_stats(label_indices_info);

std::vector<LPIndex> label_property_indices_info = index_info.label_property;
erase_not_specified_label_property_indices(label_property_indices_info);
auto label_property_stats = populate_label_property_stats(label_property_indices_info);

std::vector<std::vector<TypedValue>> results;
results.reserve(label_stats.size() + label_property_stats.size());

std::for_each(label_stats.begin(), label_stats.end(), [execution_db_accessor, &results](const auto &stat_entry) {
std::vector<TypedValue> result;
result.reserve(kComputeStatisticsNumResults);

result.emplace_back(execution_db_accessor->LabelToName(stat_entry.first));
result.emplace_back(TypedValue());
result.emplace_back(static_cast<int64_t>(stat_entry.second.count));
result.emplace_back(TypedValue());
result.emplace_back(TypedValue());
result.emplace_back(TypedValue());
result.emplace_back(stat_entry.second.avg_degree);
results.push_back(std::move(result));
});

std::for_each(label_property_stats.begin(), label_property_stats.end(),
[execution_db_accessor, &results](const auto &stat_entry) {
std::vector<TypedValue> result;
result.reserve(kComputeStatisticsNumResults);

result.emplace_back(execution_db_accessor->LabelToName(stat_entry.first.first));
result.emplace_back(execution_db_accessor->PropertyToName(stat_entry.first.second));
result.emplace_back(static_cast<int64_t>(stat_entry.second.count));
result.emplace_back(static_cast<int64_t>(stat_entry.second.distinct_values_count));
result.emplace_back(stat_entry.second.avg_group_size);
result.emplace_back(stat_entry.second.statistic);
result.emplace_back(stat_entry.second.avg_degree);
results.push_back(std::move(result));
});

return results;
}

std::vector<std::vector<TypedValue>> AnalyzeGraphQueryHandler::AnalyzeGraphDeleteStatistics(
const std::span<std::string> labels, DbAccessor *execution_db_accessor) {
std::vector<std::pair<storage::LabelId, storage::PropertyId>> loc_results;
std::vector<std::pair<storage::LabelId, storage::PropertyId>> label_prop_results;
std::vector<storage::LabelId> label_results;
if (labels[0] == kAsterisk) {
loc_results = execution_db_accessor->ClearIndexStats();
label_prop_results = execution_db_accessor->ClearLabelPropertyIndexStats();
label_results = execution_db_accessor->ClearLabelIndexStats();
} else {
loc_results = execution_db_accessor->DeleteIndexStatsForLabels(labels);
label_prop_results = execution_db_accessor->DeleteLabelPropertyIndexStats(labels);
label_results = execution_db_accessor->DeleteLabelIndexStats(labels);
}

std::vector<std::vector<TypedValue>> results;
std::transform(loc_results.begin(), loc_results.end(), std::back_inserter(results),
results.reserve(label_prop_results.size() + label_results.size());
std::transform(label_prop_results.begin(), label_prop_results.end(), std::back_inserter(results),
[execution_db_accessor](const auto &label_property_index) {
return std::vector<TypedValue>{
TypedValue(execution_db_accessor->LabelToName(label_property_index.first)),
TypedValue(execution_db_accessor->PropertyToName(label_property_index.second))};
});

std::transform(
label_results.begin(), label_results.end(), std::back_inserter(results),
[execution_db_accessor](const auto &label_index) {
return std::vector<TypedValue>{TypedValue(execution_db_accessor->LabelToName(label_index)), TypedValue("")};
});
return results;
}

@@ -1621,7 +1728,8 @@ Callback HandleAnalyzeGraphQuery(AnalyzeGraphQuery *analyze_graph_query, DbAcces
switch (analyze_graph_query->action_) {
case AnalyzeGraphQuery::Action::ANALYZE: {
callback.header = {"label", "property", "num estimation nodes",
"num groups", "avg group size", "chi-squared value"};
"num groups", "avg group size", "chi-squared value",
"avg degree"};
callback.fn = [handler = AnalyzeGraphQueryHandler(), labels = analyze_graph_query->labels_,
execution_db_accessor]() mutable {
return handler.AnalyzeGraphCreateStatistics(labels, execution_db_accessor);
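For readers skimming the diff, the per-index numbers computed above reduce to a few simple aggregates over the indexed vertices. Below is an illustrative, hedged Python rendering of the same arithmetic (not part of the commit), assuming utils::ChiSquaredValue is the usual (observed - expected)^2 / expected term:

def label_property_stats(value_counts, total_degree):
    """value_counts: property value -> number of indexed vertices holding it.
    total_degree: summed in-degree + out-degree over those same vertices."""
    count = sum(value_counts.values())   # "num estimation nodes"
    distinct = len(value_counts)         # "num groups", never 0 here
    avg_group_size = count / distinct
    chi_squared = sum((c - avg_group_size) ** 2 / avg_group_size for c in value_counts.values())
    avg_degree = total_degree / count if count else 0.0
    return count, distinct, avg_group_size, chi_squared, avg_degree

# 100 vertices split 10/10/80 over three property values, 400 edge endpoints in total.
print(label_property_stats({"a": 10, "b": 10, "c": 80}, total_degree=400))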
@@ -15,9 +15,29 @@
#include "query/parameters.hpp"
#include "query/plan/operator.hpp"
#include "query/typed_value.hpp"
#include "utils/algorithm.hpp"
#include "utils/math.hpp"

namespace memgraph::query::plan {

/**
* The symbol statistics specify essential DB statistics which
* help the query planner (namely here the cost estimator), to decide
* how to do expands and other types of Cypher manipulations.
*/
struct SymbolStatistics {
uint64_t count;
double degree;
};

/**
* Scope of the statistics for every scanned symbol in
* the operator tree.
*/
struct Scope {
std::unordered_map<std::string, SymbolStatistics> symbol_stats;
};

/**
* Query plan execution time cost estimator, for comparing and choosing optimal
* execution plans.

@@ -81,8 +101,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
using HierarchicalLogicalOperatorVisitor::PostVisit;
using HierarchicalLogicalOperatorVisitor::PreVisit;

CostEstimator(TDbAccessor *db_accessor, const Parameters &parameters)
: db_accessor_(db_accessor), parameters(parameters) {}
CostEstimator(TDbAccessor *db_accessor, const SymbolTable &table, const Parameters &parameters)
: db_accessor_(db_accessor), table_(table), parameters(parameters), scopes_{Scope()} {}

CostEstimator(TDbAccessor *db_accessor, const SymbolTable &table, const Parameters &parameters, Scope scope)
: db_accessor_(db_accessor), table_(table), parameters(parameters), scopes_{scope} {}

bool PostVisit(ScanAll &) override {
cardinality_ *= db_accessor_->VerticesCount();

@@ -92,6 +115,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
}

bool PostVisit(ScanAllByLabel &scan_all_by_label) override {
auto index_stats = db_accessor_->GetIndexStats(scan_all_by_label.label_);
if (index_stats.has_value()) {
SaveStatsFor(scan_all_by_label.output_symbol_, index_stats.value());
}

cardinality_ *= db_accessor_->VerticesCount(scan_all_by_label.label_);
// ScanAll performs some work for every element that is produced
IncrementCost(CostParam::kScanAllByLabel);

@@ -102,6 +130,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
// This cardinality estimation depends on the property value (expression).
// If it's a constant, we can evaluate cardinality exactly, otherwise
// we estimate
auto index_stats = db_accessor_->GetIndexStats(logical_op.label_, logical_op.property_);
if (index_stats.has_value()) {
SaveStatsFor(logical_op.output_symbol_, index_stats.value());
}

auto property_value = ConstPropertyValue(logical_op.expression_);
double factor = 1.0;
if (property_value)

@@ -119,6 +152,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
}

bool PostVisit(ScanAllByLabelPropertyRange &logical_op) override {
auto index_stats = db_accessor_->GetIndexStats(logical_op.label_, logical_op.property_);
if (index_stats.has_value()) {
SaveStatsFor(logical_op.output_symbol_, index_stats.value());
}

// this cardinality estimation depends on Bound expressions.
// if they are literals we can evaluate cardinality properly
auto lower = BoundToPropertyValue(logical_op.lower_bound_);

@@ -144,6 +182,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
}

bool PostVisit(ScanAllByLabelProperty &logical_op) override {
auto index_stats = db_accessor_->GetIndexStats(logical_op.label_, logical_op.property_);
if (index_stats.has_value()) {
SaveStatsFor(logical_op.output_symbol_, index_stats.value());
}

const auto factor = db_accessor_->VerticesCount(logical_op.label_, logical_op.property_);
cardinality_ *= factor;
IncrementCost(CostParam::MakeScanAllByLabelProperty);

@@ -152,6 +195,20 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {

// TODO: Cost estimate ScanAllById?

bool PostVisit(Expand &expand) override {
auto card_param = CardParam::kExpand;
auto stats = GetStatsFor(expand.input_symbol_);

if (stats.has_value()) {
card_param = stats.value().degree;
}

cardinality_ *= card_param;
IncrementCost(CostParam::kExpand);

return true;
}

// For the given op first increments the cardinality and then cost.
#define POST_VISIT_CARD_FIRST(NAME) \
bool PostVisit(NAME &) override { \

@@ -160,7 +217,6 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
return true; \
}

POST_VISIT_CARD_FIRST(Expand);
POST_VISIT_CARD_FIRST(ExpandVariable);

#undef POST_VISIT_CARD_FIRST

@@ -225,20 +281,42 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
return false;
}

bool PostVisit(Produce &op) override {
auto scope = Scope();

// translate all the stats to the scope outside the return
for (const auto &symbol : op.ModifiedSymbols(table_)) {
auto stats = GetStatsFor(symbol);
if (stats.has_value()) {
scope.symbol_stats[symbol.name()] =
SymbolStatistics{.count = stats.value().count, .degree = stats.value().degree};
}
}

scopes_.push_back(std::move(scope));
return true;
}

bool PreVisit(Apply &op) override {
double input_cost = EstimateCostOnBranch(&op.input_);
double subquery_cost = EstimateCostOnBranch(&op.subquery_);
// Get the cost of the main branch
op.input_->Accept(*this);

// if the query is a unit subquery, we don't want the cost to be zero but 1xN
input_cost = input_cost == 0 ? 1 : input_cost;
subquery_cost = subquery_cost == 0 ? 1 : subquery_cost;
// Estimate cost on the subquery branch independently, use a copy
auto &last_scope = scopes_.back();
double subquery_cost = EstimateCostOnBranch(&op.subquery_, last_scope);
subquery_cost = !utils::ApproxEqualDecimal(subquery_cost, 0.0) ? subquery_cost : 1;
cardinality_ *= subquery_cost;

cardinality_ *= input_cost * subquery_cost;
IncrementCost(CostParam::kSubquery);

return false;
}

bool PostVisit(EmptyResult & /*op*/) override {
scopes_.emplace_back();
return true;
}

bool Visit(Once &) override { return true; }

auto cost() const { return cost_; }

@@ -255,12 +333,20 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {

// accessor used for cardinality estimates in ScanAll and ScanAllByLabel
TDbAccessor *db_accessor_;
const SymbolTable &table_;
const Parameters &parameters;
std::vector<Scope> scopes_;

void IncrementCost(double param) { cost_ += param * cardinality_; }

double EstimateCostOnBranch(std::shared_ptr<LogicalOperator> *branch) {
CostEstimator<TDbAccessor> cost_estimator(db_accessor_, parameters);
CostEstimator<TDbAccessor> cost_estimator(db_accessor_, table_, parameters);
(*branch)->Accept(cost_estimator);
return cost_estimator.cost();
}

double EstimateCostOnBranch(std::shared_ptr<LogicalOperator> *branch, Scope scope) {
CostEstimator<TDbAccessor> cost_estimator(db_accessor_, table_, parameters, scope);
(*branch)->Accept(cost_estimator);
return cost_estimator.cost();
}

@@ -287,12 +373,32 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
}
return std::nullopt;
}

bool HasStatsFor(const Symbol &symbol) const { return utils::Contains(scopes_.back().symbol_stats, symbol.name()); }

std::optional<SymbolStatistics> GetStatsFor(const Symbol &symbol) {
if (!HasStatsFor(symbol)) {
return std::nullopt;
}

auto &scope = scopes_.back();
return scope.symbol_stats[symbol.name()];
}

template <typename T>
void SaveStatsFor(const Symbol &symbol, T index_stats) {
scopes_.back().symbol_stats[symbol.name()] = SymbolStatistics{
.count = index_stats.count,
.degree = index_stats.avg_degree,
};
}
};

/** Returns the estimated cost of the given plan. */
template <class TDbAccessor>
double EstimatePlanCost(TDbAccessor *db, const Parameters &parameters, LogicalOperator &plan) {
CostEstimator<TDbAccessor> estimator(db, parameters);
double EstimatePlanCost(TDbAccessor *db, const SymbolTable &table, const Parameters &parameters,
LogicalOperator &plan) {
CostEstimator<TDbAccessor> estimator(db, table, parameters);
plan.Accept(estimator);
return estimator.cost();
}
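The Expand visitor above is where the new statistic changes plans: when ANALYZE GRAPH has stored an average degree for the symbol being expanded from, that value replaces the fixed CardParam::kExpand factor. A toy, hedged Python rendering of that decision follows (the function name and the default factor are illustrative stand-ins, not the real constants):

EXPAND_DEFAULT_FACTOR = 3.0  # stand-in for CardParam::kExpand, value assumed

def expand_cardinality(input_cardinality, input_symbol_stats):
    """input_symbol_stats is None or {'avg_degree': ...} saved by a previous scan."""
    factor = input_symbol_stats["avg_degree"] if input_symbol_stats else EXPAND_DEFAULT_FACTOR
    return input_cardinality * factor

# Expanding out of a supernode whose avg degree is 1000 now looks far more
# expensive than expanding from an ordinary node, so the planner flips the edge.
print(expand_cardinality(1.0, {"avg_degree": 1000.0}))  # 1000.0
print(expand_cardinality(1.0, None))                    # 3.0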
@@ -47,8 +47,9 @@ class PostProcessor final {
}

template <class TVertexCounts>
double EstimatePlanCost(const std::unique_ptr<LogicalOperator> &plan, TVertexCounts *vertex_counts) {
return query::plan::EstimatePlanCost(vertex_counts, parameters_, *plan);
double EstimatePlanCost(const std::unique_ptr<LogicalOperator> &plan, TVertexCounts *vertex_counts,
const SymbolTable &table) {
return query::plan::EstimatePlanCost(vertex_counts, table, parameters_, *plan);
}
};

@@ -97,7 +98,7 @@ auto MakeLogicalPlan(TPlanningContext *context, TPlanPostProcess *post_process,
// Plans are generated lazily and the current plan will disappear, so
// it's ok to move it.
auto rewritten_plan = post_process->Rewrite(std::move(plan), context);
double cost = post_process->EstimatePlanCost(rewritten_plan, &vertex_counts);
double cost = post_process->EstimatePlanCost(rewritten_plan, &vertex_counts, *context->symbol_table);
if (!curr_plan || cost < total_cost) {
curr_plan.emplace(std::move(rewritten_plan));
total_cost = cost;

@@ -106,7 +107,7 @@ auto MakeLogicalPlan(TPlanningContext *context, TPlanPostProcess *post_process,
} else {
auto plan = MakeLogicalPlanForSingleQuery<RuleBasedPlanner>(query_parts, context);
auto rewritten_plan = post_process->Rewrite(std::move(plan), context);
total_cost = post_process->EstimatePlanCost(rewritten_plan, &vertex_counts);
total_cost = post_process->EstimatePlanCost(rewritten_plan, &vertex_counts, *context->symbol_table);
curr_plan.emplace(std::move(rewritten_plan));
}
@@ -505,7 +505,7 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor {
// FilterInfo with PropertyFilter.
FilterInfo filter;
int64_t vertex_count;
std::optional<storage::IndexStats> index_stats;
std::optional<storage::LabelPropertyIndexStats> index_stats;
};

bool DefaultPreVisit() override { throw utils::NotYetImplemented("optimizing index lookup"); }

@@ -572,8 +572,8 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor {
* @param vertex_count: New index's number of vertices.
* @return -1 if the new index is better, 0 if they are equal and 1 if the existing one is better.
*/
auto compare_indices = [](std::optional<LabelPropertyIndex> &found, std::optional<storage::IndexStats> &new_stats,
int vertex_count) {
auto compare_indices = [](std::optional<LabelPropertyIndex> &found,
std::optional<storage::LabelPropertyIndexStats> &new_stats, int vertex_count) {
if (!new_stats.has_value()) {
return 0;
}

@@ -610,7 +610,8 @@ class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor {
};

int64_t vertex_count = db_->VerticesCount(GetLabel(label), GetProperty(property));
std::optional<storage::IndexStats> new_stats = db_->GetIndexStats(GetLabel(label), GetProperty(property));
std::optional<storage::LabelPropertyIndexStats> new_stats =
db_->GetIndexStats(GetLabel(label), GetProperty(property));

// Conditions, from more to less important:
// the index with 10x less vertices is better.
@@ -78,8 +78,12 @@ class VertexCountCache {
return db_->LabelPropertyIndexExists(label, property);
}

std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const {
return db_->GetIndexStats(label);
}

std::optional<storage::LabelPropertyIndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
return db_->GetIndexStats(label, property);
}
@@ -478,6 +478,40 @@ void LabelIndex::RunGC() {
}
}

void LabelIndex::SetIndexStats(const storage::LabelId &label, const storage::LabelIndexStats &stats) {
stats_[label] = stats;
}

std::optional<LabelIndexStats> LabelIndex::GetIndexStats(const storage::LabelId &label) const {
if (auto it = stats_.find(label); it != stats_.end()) {
return it->second;
}
return {};
}

std::vector<LabelId> LabelIndex::ClearIndexStats() {
std::vector<LabelId> deleted_indexes;
deleted_indexes.reserve(stats_.size());
std::transform(stats_.begin(), stats_.end(), std::back_inserter(deleted_indexes),
[](const auto &elem) { return elem.first; });
stats_.clear();
return deleted_indexes;
}

std::vector<LabelId> LabelIndex::DeleteIndexStats(const storage::LabelId &label) {
std::vector<LabelId> deleted_indexes;
for (auto it = stats_.cbegin(); it != stats_.cend();) {
if (it->first == label) {
deleted_indexes.push_back(it->first);
it = stats_.erase(it);
} else {
++it;
}
}

return deleted_indexes;
}

bool LabelPropertyIndex::Entry::operator<(const Entry &rhs) {
if (value < rhs.value) {
return true;

@@ -814,8 +848,7 @@ int64_t LabelPropertyIndex::ApproximateVertexCount(LabelId label, PropertyId pro
/*
Iterate over all property-label pairs and deletes if label from the index is equal to label parameter.
*/
std::vector<std::pair<LabelId, PropertyId>> LabelPropertyIndex::DeleteIndexStatsForLabel(
const storage::LabelId &label) {
std::vector<std::pair<LabelId, PropertyId>> LabelPropertyIndex::DeleteIndexStats(const storage::LabelId &label) {
std::vector<std::pair<LabelId, PropertyId>> deleted_indexes;
for (auto it = stats_.cbegin(); it != stats_.cend();) {
if (it->first.first == label) {

@@ -837,14 +870,14 @@ std::vector<std::pair<LabelId, PropertyId>> LabelPropertyIndex::ClearIndexStats(
return deleted_indexes;
}

void LabelPropertyIndex::SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property,
const IndexStats &stats) {
stats_[{label, property}] = stats;
void LabelPropertyIndex::SetIndexStats(const std::pair<storage::LabelId, storage::PropertyId> &key,
const storage::LabelPropertyIndexStats &stats) {
stats_[key] = stats;
}

std::optional<IndexStats> LabelPropertyIndex::GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
if (auto it = stats_.find({label, property}); it != stats_.end()) {
std::optional<storage::LabelPropertyIndexStats> LabelPropertyIndex::GetIndexStats(
const std::pair<storage::LabelId, storage::PropertyId> &key) const {
if (auto it = stats_.find(key); it != stats_.end()) {
return it->second;
}
return {};
@@ -31,6 +31,11 @@ struct Constraints;
using ParalellizedIndexCreationInfo =
std::pair<std::vector<std::pair<Gid, uint64_t>> /*vertex_recovery_info*/, uint64_t /*thread_count*/>;

struct LabelIndexStats {
uint64_t count;
double avg_degree;
};

class LabelIndex {
private:
struct Entry {

@@ -124,19 +129,29 @@ class LabelIndex {
return it->second.size();
}

void SetIndexStats(const storage::LabelId &label, const storage::LabelIndexStats &stats);

std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const;

std::vector<LabelId> ClearIndexStats();

std::vector<LabelId> DeleteIndexStats(const storage::LabelId &label);

void Clear() { index_.clear(); }

void RunGC();

private:
std::map<LabelId, utils::SkipList<Entry>> index_;
std::map<LabelId, storage::LabelIndexStats> stats_;
Indices *indices_;
Constraints *constraints_;
Config::Items config_;
};

struct IndexStats {
double statistic, avg_group_size;
struct LabelPropertyIndexStats {
uint64_t count, distinct_values_count;
double statistic, avg_group_size, avg_degree;
};

class LabelPropertyIndex {

@@ -248,13 +263,13 @@ class LabelPropertyIndex {

std::vector<std::pair<LabelId, PropertyId>> ClearIndexStats();

std::vector<std::pair<LabelId, PropertyId>> DeleteIndexStatsForLabel(const storage::LabelId &label);
std::vector<std::pair<LabelId, PropertyId>> DeleteIndexStats(const storage::LabelId &label);

void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property,
const storage::IndexStats &stats);
void SetIndexStats(const std::pair<storage::LabelId, storage::PropertyId> &key,
const storage::LabelPropertyIndexStats &stats);

std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const;
std::optional<storage::LabelPropertyIndexStats> GetIndexStats(
const std::pair<storage::LabelId, storage::PropertyId> &key) const;

void Clear() { index_.clear(); }

@@ -262,7 +277,7 @@ class LabelPropertyIndex {

private:
std::map<std::pair<LabelId, PropertyId>, utils::SkipList<Entry>> index_;
std::map<std::pair<LabelId, PropertyId>, storage::IndexStats> stats_;
std::map<std::pair<LabelId, PropertyId>, storage::LabelPropertyIndexStats> stats_;
Indices *indices_;
Constraints *constraints_;
Config::Items config_;
@@ -267,28 +267,66 @@ class Storage final {
return storage_->indices_.label_property_index.ApproximateVertexCount(label, property, lower, upper);
}

std::optional<storage::IndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
return storage_->indices_.label_property_index.GetIndexStats(label, property);
template <typename TResult, typename TIndex, typename TIndexKey>
std::optional<TResult> GetIndexStatsForIndex(TIndex &index, TIndexKey &&key) const {
return index.GetIndexStats(key);
}

std::vector<std::pair<LabelId, PropertyId>> ClearIndexStats() {
return storage_->indices_.label_property_index.ClearIndexStats();
std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const {
return GetIndexStatsForIndex<storage::LabelIndexStats>(storage_->indices_.label_index, label);
}

std::vector<std::pair<LabelId, PropertyId>> DeleteIndexStatsForLabels(const std::span<std::string> labels) {
std::vector<std::pair<LabelId, PropertyId>> deleted_indexes;
std::for_each(labels.begin(), labels.end(), [this, &deleted_indexes](const auto &label_str) {
std::vector<std::pair<LabelId, PropertyId>> loc_results =
storage_->indices_.label_property_index.DeleteIndexStatsForLabel(NameToLabel(label_str));
std::optional<storage::LabelPropertyIndexStats> GetIndexStats(const storage::LabelId &label,
const storage::PropertyId &property) const {
return GetIndexStatsForIndex<storage::LabelPropertyIndexStats>(storage_->indices_.label_property_index,
std::make_pair(label, property));
}

template <typename TIndex, typename TIndexKey, typename TIndexStats>
void SetIndexStatsForIndex(TIndex &index, TIndexKey &&key, TIndexStats &stats) const {
index.SetIndexStats(key, stats);
}

void SetIndexStats(const storage::LabelId &label, const LabelIndexStats &stats) {
SetIndexStatsForIndex(storage_->indices_.label_index, label, stats);
}

void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property,
const LabelPropertyIndexStats &stats) {
SetIndexStatsForIndex(storage_->indices_.label_property_index, std::make_pair(label, property), stats);
}

template <typename TResult, typename TIndex>
std::vector<TResult> ClearIndexStatsForIndex(TIndex &index) const {
return index.ClearIndexStats();
}

std::vector<std::pair<LabelId, PropertyId>> ClearLabelPropertyIndexStats() {
return ClearIndexStatsForIndex<std::pair<LabelId, PropertyId>>(storage_->indices_.label_property_index);
}

std::vector<LabelId> ClearLabelIndexStats() {
return ClearIndexStatsForIndex<LabelId>(storage_->indices_.label_index);
}

template <typename TResult, typename TIndex>
std::vector<TResult> DeleteIndexStatsForIndex(TIndex &index, const std::span<std::string> labels) {
std::vector<TResult> deleted_indexes;

for (const auto &label : labels) {
std::vector<TResult> loc_results = index.DeleteIndexStats(NameToLabel(label));
deleted_indexes.insert(deleted_indexes.end(), std::make_move_iterator(loc_results.begin()),
std::make_move_iterator(loc_results.end()));
});
}
return deleted_indexes;
}

void SetIndexStats(const storage::LabelId &label, const storage::PropertyId &property, const IndexStats &stats) {
storage_->indices_.label_property_index.SetIndexStats(label, property, stats);
std::vector<std::pair<LabelId, PropertyId>> DeleteLabelPropertyIndexStats(const std::span<std::string> labels) {
return DeleteIndexStatsForIndex<std::pair<LabelId, PropertyId>>(storage_->indices_.label_property_index, labels);
}

std::vector<LabelId> DeleteLabelIndexStats(const std::span<std::string> labels) {
return DeleteIndexStatsForIndex<LabelId>(storage_->indices_.label_index, labels);
}

/// @return Accessor to the deleted vertex if a deletion took place, std::nullopt otherwise
@@ -131,7 +131,7 @@ static void BM_PlanAndEstimateIndexedMatching(benchmark::State &state) {
auto plans = memgraph::query::plan::MakeLogicalPlanForSingleQuery<memgraph::query::plan::VariableStartPlanner>(
query_parts, &ctx);
for (auto plan : plans) {
memgraph::query::plan::EstimatePlanCost(&dba, parameters, *plan);
memgraph::query::plan::EstimatePlanCost(&dba, symbol_table, parameters, *plan);
}
}
}

@@ -161,7 +161,7 @@ static void BM_PlanAndEstimateIndexedMatchingWithCachedCounts(benchmark::State &
auto plans = memgraph::query::plan::MakeLogicalPlanForSingleQuery<memgraph::query::plan::VariableStartPlanner>(
query_parts, &ctx);
for (auto plan : plans) {
memgraph::query::plan::EstimatePlanCost(&vertex_counts, parameters, *plan);
memgraph::query::plan::EstimatePlanCost(&vertex_counts, symbol_table, parameters, *plan);
}
}
}
@@ -13,6 +13,7 @@ import typing

import mgclient
import pytest
from gqlalchemy import Memgraph


def execute_and_fetch_all(cursor: mgclient.Cursor, query: str, params: dict = {}) -> typing.List[tuple]:

@@ -27,3 +28,14 @@ def connect(**kwargs) -> mgclient.Connection:
yield connection
cursor = connection.cursor()
execute_and_fetch_all(cursor, "MATCH (n) DETACH DELETE n")


@pytest.fixture
def memgraph(**kwargs) -> Memgraph:
memgraph = Memgraph()

yield memgraph

memgraph.drop_database()
memgraph.execute("analyze graph delete statistics;")
memgraph.drop_indexes()
@@ -12,7 +12,10 @@
import sys

import pytest
from common import connect, execute_and_fetch_all
from common import connect, execute_and_fetch_all, memgraph

QUERY_PLAN = "QUERY PLAN"


# E2E tests for checking query semantic
# ------------------------------------

@@ -96,8 +99,8 @@ def test_analyze_full_graph(analyze_query, connect):
else:
first_index = 1
# Check results
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 5, 10, 0)
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 5, 10, 0, 0)
# After analyzing graph, id1 index should be chosen because it has smaller average group size
expected_explain_after_analysis = [
(f" * Produce {{n}}",),

@@ -131,8 +134,8 @@ def test_cardinality_different_avg_group_size_uniform_dist(connect):
else:
first_index = 1
# Check results
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 20, 5, 0)
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 20, 5, 0, 0)
expected_explain_after_analysis = [
(f" * Produce {{n}}",),
(f" * Filter",),

@@ -161,8 +164,8 @@ def test_cardinality_same_avg_group_size_uniform_dist_diff_vertex_count(connect)
else:
first_index = 1
# Check results
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 50, 1, 0)
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 100, 1, 0, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 50, 50, 1, 0, 0)
expected_explain_after_analysis = [
(f" * Produce {{n}}",),
(f" * Filter",),

@@ -191,8 +194,8 @@ def test_large_diff_in_num_vertices_v1(connect):
else:
first_index = 1
# Check results
assert analyze_graph_results[first_index] == ("Label", "id1", 1000, 1000, 1, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 99, 1, 99, 0)
assert analyze_graph_results[first_index] == ("Label", "id1", 1000, 1000, 1, 0, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 99, 1, 99, 0, 0)
expected_explain_after_analysis = [
(f" * Produce {{n}}",),
(f" * Filter",),

@@ -221,8 +224,8 @@ def test_large_diff_in_num_vertices_v2(connect):
else:
first_index = 1
# Check results
assert analyze_graph_results[first_index] == ("Label", "id1", 99, 1, 99, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 1000, 1000, 1, 0)
assert analyze_graph_results[first_index] == ("Label", "id1", 99, 1, 99, 0, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 1000, 1000, 1, 0, 0)
expected_explain_after_analysis = [
(f" * Produce {{n}}",),
(f" * Filter",),

@@ -261,8 +264,8 @@ def test_same_avg_group_size_diff_distribution(connect):
else:
first_index = 1
# Check results
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 5, 20, 32.5)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 5, 20, 0)
assert analyze_graph_results[first_index] == ("Label", "id1", 100, 5, 20, 32.5, 0)
assert analyze_graph_results[1 - first_index] == ("Label", "id2", 100, 5, 20, 0, 0)
expected_explain_after_analysis = [
(f" * Produce {{n}}",),
(f" * Filter",),

@@ -278,5 +281,194 @@ def test_same_avg_group_size_diff_distribution(connect):
execute_and_fetch_all(cursor, "DROP INDEX ON :Label(id2);")


def test_given_supernode_when_expanding_then_expand_other_way_around(memgraph):
memgraph.execute("FOREACH (i in range(1, 1000) | CREATE (:Node {id: i}));")
memgraph.execute("CREATE (:SuperNode {id: 1});")
memgraph.execute("CREATE INDEX ON :SuperNode(id);")
memgraph.execute("CREATE INDEX ON :SuperNode;")
memgraph.execute("CREATE INDEX ON :Node(id);")
memgraph.execute("CREATE INDEX ON :Node;")
memgraph.execute("match (n:Node) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);")

query = "explain match (n:Node) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);"
expected_explain = [
f" * EmptyResult",
f" * Merge",
f" |\\ On Match",
f" | * Expand (s)-[anon3:HAS_REL_TO]->(n)",
f" | * Once",
f" |\\ On Create",
f" | * CreateExpand (n)<-[anon3:HAS_REL_TO]-(s)",
f" | * Once",
f" * ScanAllByLabel (n :Node)",
f" * ScanAllByLabelPropertyValue (s :SuperNode {{id}})",
f" * Once",
]

result_without_analysis = list(memgraph.execute_and_fetch(query))
result_without_analysis = [x[QUERY_PLAN] for x in result_without_analysis]
assert expected_explain == result_without_analysis

memgraph.execute("analyze graph;")

expected_explain = [
x.replace(f" | * Expand (s)-[anon3:HAS_REL_TO]->(n)", f" | * Expand (n)<-[anon3:HAS_REL_TO]-(s)")
for x in expected_explain
]

result_with_analysis = list(memgraph.execute_and_fetch(query))
result_with_analysis = [x[QUERY_PLAN] for x in result_with_analysis]

assert expected_explain == result_with_analysis


def test_given_supernode_when_subquery_then_carry_information_to_subquery(memgraph):
memgraph.execute("FOREACH (i in range(1, 1000) | CREATE (:Node {id: i}));")
memgraph.execute("FOREACH (i in range(1, 1000) | CREATE (:Node2 {id: i}));")
memgraph.execute("CREATE (:SuperNode {id: 1});")
memgraph.execute("CREATE INDEX ON :SuperNode(id);")
memgraph.execute("CREATE INDEX ON :SuperNode;")
memgraph.execute("CREATE INDEX ON :Node(id);")
memgraph.execute("CREATE INDEX ON :Node;")
memgraph.execute("CREATE INDEX ON :Node2(id);")
memgraph.execute("CREATE INDEX ON :Node2;")

memgraph.execute("match (n:Node) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);")
memgraph.execute("match (n:Node2) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);")

query = (
"explain match (n:Node) match (s:SuperNode {id: 1}) call { with n, s merge (n)<-[:HAS_REL_TO]-(s) } return 1"
)
expected_explain = [
f" * Produce {{0}}",
f" * Accumulate",
f" * Accumulate",
f" * Apply",
f" |\\ ",
f" | * EmptyResult",
f" | * Merge",
f" | |\\ On Match",
f" | | * Expand (s)-[anon3:HAS_REL_TO]->(n)",
f" | | * Once",
f" | |\\ On Create",
f" | | * CreateExpand (n)<-[anon3:HAS_REL_TO]-(s)",
f" | | * Once",
f" | * Produce {{n, s}}",
f" | * Once",
f" * ScanAllByLabel (n :Node)",
f" * ScanAllByLabelPropertyValue (s :SuperNode {{id}})",
f" * Once",
]

result_without_analysis = list(memgraph.execute_and_fetch(query))
result_without_analysis = [x[QUERY_PLAN] for x in result_without_analysis]
assert expected_explain == result_without_analysis

memgraph.execute("analyze graph;")

expected_explain = [
x.replace(f" | | * Expand (s)-[anon3:HAS_REL_TO]->(n)", f" | | * Expand (n)<-[anon3:HAS_REL_TO]-(s)")
for x in expected_explain
]
result_with_analysis = list(memgraph.execute_and_fetch(query))
result_with_analysis = [x[QUERY_PLAN] for x in result_with_analysis]

assert expected_explain == result_with_analysis


def test_given_supernode_when_subquery_and_union_then_carry_information(memgraph):
memgraph.execute("FOREACH (i in range(1, 1000) | CREATE (:Node {id: i}));")
memgraph.execute("FOREACH (i in range(1, 1000) | CREATE (:Node2 {id: i}));")
memgraph.execute("CREATE (:SuperNode {id: 1});")
memgraph.execute("CREATE INDEX ON :SuperNode(id);")
memgraph.execute("CREATE INDEX ON :SuperNode;")
memgraph.execute("CREATE INDEX ON :Node(id);")
memgraph.execute("CREATE INDEX ON :Node;")
memgraph.execute("CREATE INDEX ON :Node2(id);")
memgraph.execute("CREATE INDEX ON :Node2;")

memgraph.execute("match (n:Node) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);")
memgraph.execute("match (n:Node2) match (s:SuperNode {id: 1}) merge (n)<-[:HAS_REL_TO]-(s);")

query = "explain match (n:Node) match (s:SuperNode {id: 1}) call { with n, s merge (n)<-[:HAS_REL_TO]-(s) } return s union all match (n:Node) match (s:SuperNode {id: 1}) call { with n, s merge (n)<-[:HAS_REL_TO]-(s) } return s;"
expected_explain = [
f" * Union {{s : s}}",
f" |\\ ",
f" | * Produce {{s}}",
f" | * Accumulate",
f" | * Accumulate",
f" | * Apply",
f" | |\\ ",
f" | | * EmptyResult",
f" | | * Merge",
f" | | |\\ On Match",
f" | | | * Expand (s)-[anon7:HAS_REL_TO]->(n)",
f" | | | * Once",
f" | | |\\ On Create",
f" | | | * CreateExpand (n)<-[anon7:HAS_REL_TO]-(s)",
f" | | | * Once",
f" | | * Produce {{n, s}}",
f" | | * Once",
f" | * ScanAllByLabel (n :Node)",
f" | * ScanAllByLabelPropertyValue (s :SuperNode {{id}})",
f" | * Once",
f" * Produce {{s}}",
f" * Accumulate",
f" * Accumulate",
f" * Apply",
f" |\\ ",
f" | * EmptyResult",
f" | * Merge",
f" | |\\ On Match",
f" | | * Expand (s)-[anon3:HAS_REL_TO]->(n)",
f" | | * Once",
f" | |\\ On Create",
f" | | * CreateExpand (n)<-[anon3:HAS_REL_TO]-(s)",
f" | | * Once",
f" | * Produce {{n, s}}",
f" | * Once",
f" * ScanAllByLabel (n :Node)",
f" * ScanAllByLabelPropertyValue (s :SuperNode {{id}})",
f" * Once",
]

result_without_analysis = list(memgraph.execute_and_fetch(query))
result_without_analysis = [x[QUERY_PLAN] for x in result_without_analysis]
assert expected_explain == result_without_analysis

memgraph.execute("analyze graph;")

expected_explain = [
x.replace(f" | | * Expand (s)-[anon3:HAS_REL_TO]->(n)", f" | | * Expand (n)<-[anon3:HAS_REL_TO]-(s)")
for x in expected_explain
]
expected_explain = [
x.replace(f" | | | * Expand (s)-[anon7:HAS_REL_TO]->(n)", f" | | | * Expand (n)<-[anon7:HAS_REL_TO]-(s)")
for x in expected_explain
]
result_with_analysis = list(memgraph.execute_and_fetch(query))
result_with_analysis = [x[QUERY_PLAN] for x in result_with_analysis]

assert expected_explain == result_with_analysis


def test_given_empty_graph_when_analyzing_graph_return_zero_degree(memgraph):
memgraph.execute("CREATE INDEX ON :Node;")

label_stats = next(memgraph.execute_and_fetch("analyze graph;"))

expected_analysis = {
"label": "Node",
"property": None,
"num estimation nodes": 0,
"num groups": None,
"avg group size": None,
"chi-squared value": None,
"avg degree": 0.0,
}

assert set(label_stats) == set(expected_analysis)


if __name__ == "__main__":
sys.exit(pytest.main([__file__, "-rA"]))
@@ -27,6 +27,7 @@
#include "query/plan/planner.hpp"
#include "query/plan/pretty_print.hpp"
#include "query/typed_value.hpp"
#include "storage/v2/indices.hpp"
#include "storage/v2/property_value.hpp"
#include "utils/string.hpp"

@@ -213,8 +214,12 @@ class InteractiveDbAccessor {
return label_property_index_.at(key);
}

std::optional<memgraph::storage::IndexStats> GetIndexStats(memgraph::storage::LabelId label,
memgraph::storage::PropertyId property) const {
std::optional<memgraph::storage::LabelIndexStats> GetIndexStats(const memgraph::storage::LabelId label) const {
return dba_->GetIndexStats(label);
}

std::optional<memgraph::storage::LabelPropertyIndexStats> GetIndexStats(
const memgraph::storage::LabelId label, const memgraph::storage::PropertyId property) const {
return dba_->GetIndexStats(label, property);
}

@@ -458,7 +463,7 @@ auto MakeLogicalPlans(memgraph::query::CypherQuery *query, memgraph::query::AstS
memgraph::query::AstStorage ast_copy;
auto unoptimized_plan = plan->Clone(&ast_copy);
auto rewritten_plan = post_process.Rewrite(std::move(plan), &ctx);
double cost = post_process.EstimatePlanCost(rewritten_plan, dba);
double cost = post_process.EstimatePlanCost(rewritten_plan, dba, symbol_table);
interactive_plans.push_back(
InteractivePlan{std::move(unoptimized_plan), std::move(ast_copy), std::move(rewritten_plan), cost});
}

@@ -74,7 +74,7 @@ class QueryCostEstimator : public ::testing::Test {
}

auto Cost() {
CostEstimator<memgraph::query::DbAccessor> cost_estimator(&*dba, parameters_);
CostEstimator<memgraph::query::DbAccessor> cost_estimator(&*dba, symbol_table_, parameters_);
last_op_->Accept(cost_estimator);
return cost_estimator.cost();
}

@@ -201,7 +201,7 @@ TEST_F(QueryCostEstimator, SubqueryCartesian) {
std::shared_ptr<LogicalOperator> input = std::make_shared<ScanAll>(std::make_shared<Once>(), NextSymbol());
std::shared_ptr<LogicalOperator> subquery = std::make_shared<ScanAll>(std::make_shared<Once>(), NextSymbol());
MakeOp<memgraph::query::plan::Apply>(input, subquery, true);
EXPECT_COST(CostParam::kSubquery * no_vertices * no_vertices);
EXPECT_COST(CostParam::kSubquery * no_vertices * no_vertices + no_vertices);
}

TEST_F(QueryCostEstimator, UnitSubquery) {

@@ -500,9 +500,13 @@ class FakeDbAccessor {
return false;
}

memgraph::storage::IndexStats GetIndexStats(memgraph::storage::LabelId label,
memgraph::storage::PropertyId property) const {
return memgraph::storage::IndexStats{.statistic = 0, .avg_group_size = 1}; // unique id
std::optional<memgraph::storage::LabelPropertyIndexStats> GetIndexStats(
const memgraph::storage::LabelId label, const memgraph::storage::PropertyId property) const {
return memgraph::storage::LabelPropertyIndexStats{.statistic = 0, .avg_group_size = 1}; // unique id
}

std::optional<memgraph::storage::LabelIndexStats> GetIndexStats(const memgraph::storage::LabelId label) const {
return memgraph::storage::LabelIndexStats{.count = 0, .avg_degree = 0}; // unique id
}

void SetIndexCount(memgraph::storage::LabelId label, int64_t count) { label_index_[label] = count; }

@@ -1252,4 +1252,11 @@ TEST_F(TestSymbolGenerator, Subqueries) {
query = QUERY(SINGLE_QUERY(MATCH(PATTERN(NODE("n"))), CALL_SUBQUERY(subquery), RETURN("n", "m")));
symbol_table = MakeSymbolTable(query);
ASSERT_EQ(symbol_table.max_position(), 11);

// MATCH (n) CALL { MATCH (s) RETURN s } RETURN n UNION MATCH (n) CALL { MATCH (s) RETURN s } RETURN n
subquery = QUERY(SINGLE_QUERY(MATCH(PATTERN(NODE("s"))), RETURN("s")));
query = QUERY(SINGLE_QUERY(MATCH(PATTERN(NODE("n"))), CALL_SUBQUERY(subquery), RETURN("n")),
UNION(SINGLE_QUERY(MATCH(PATTERN(NODE("n"))), CALL_SUBQUERY(subquery), RETURN("n"))));
symbol_table = MakeSymbolTable(query);
ASSERT_EQ(symbol_table.max_position(), 13);
}