From bce4407ebef4757f66157c1305eda62a71362275 Mon Sep 17 00:00:00 2001 From: florijan Date: Thu, 3 Aug 2017 14:18:19 +0200 Subject: [PATCH] CostEstimator - adding new ScanAll and Expand ops Summary: Now all ScanAll and Expand ops are covered by the cost estimator. For ScanAll with indices cost estimation is pretty good, for new Expand ops it is tragically bad (Expand to the power of expansion depth, plus arbitrary filtering). Static cost estimation is wrong wrong wrong. Currently cost estimation of even trivial plans that use indices is wrong because the planner leaves filtering expressions that are implicitly handled by the index in the operator tree, IIRC. Tasking Teon to revise this, even though I'm not sure how bad an influence this has on cost estimation and its use in plan choosing. Reviewers: mislav.bradac, teon.banek, buda Reviewed By: mislav.bradac Subscribers: pullbot Differential Revision: https://phabricator.memgraph.io/D633 --- src/query/plan/cost_estimator.cpp | 113 ++++++++++++++++++++++++---- src/query/plan/cost_estimator.hpp | 54 ++++++++----- src/query/typed_value.cpp | 14 ++++ src/query/typed_value.hpp | 4 + tests/unit/query_cost_estimator.cpp | 105 ++++++++++++++++++++++---- 5 files changed, 242 insertions(+), 48 deletions(-) diff --git a/src/query/plan/cost_estimator.cpp b/src/query/plan/cost_estimator.cpp index c642bcd5c..d029b12f7 100644 --- a/src/query/plan/cost_estimator.cpp +++ b/src/query/plan/cost_estimator.cpp @@ -1,3 +1,5 @@ +#include + #include "cost_estimator.hpp" namespace query::plan { @@ -11,31 +13,112 @@ bool CostEstimator::PostVisit(ScanAll &) { bool CostEstimator::PostVisit(ScanAllByLabel &scan_all_by_label) { cardinality_ *= db_accessor_.vertices_count(scan_all_by_label.label()); - // ScanAllByLabel performs some work for every element that is produced + // ScanAll performs some work for every element that is produced IncrementCost(CostParam::kScanAllByLabel); return true; } -bool CostEstimator::PostVisit(Expand &) { 
cardinality_ *= CardParam::kExpand; - // Expand performs some work for every expansion - IncrementCost(CostParam::kExpand); +bool CostEstimator::PostVisit(ScanAllByLabelPropertyValue &logical_op) { + // this cardinality estimation depends on the property value (expression). + // if it's a literal (const) we can evaluate cardinality exactly, otherwise + // we estimate + std::experimental::optional property_value = + std::experimental::nullopt; + if (auto *literal = dynamic_cast(logical_op.expression())) + if (literal->value_.IsPropertyValue()) + property_value = + std::experimental::optional(literal->value_); + + double factor = 1.0; + if (property_value) + // get the exact influence based on ScanAll(label, property, value) + factor = db_accessor_.vertices_count( + logical_op.label(), logical_op.property(), property_value.value()); + else + // estimate the influence as ScanAll(label, property) * filtering + factor = + db_accessor_.vertices_count(logical_op.label(), logical_op.property()) * + CardParam::kFilter; + + cardinality_ *= factor; + + // ScanAll performs some work for every element that is produced + IncrementCost(CostParam::MakeScanAllByLabelPropertyValue); return true; } -// for the given op first increments the cost and then cardinality -#define POST_VISIT(LOGICAL_OP, PARAM_NAME) \ - bool CostEstimator::PostVisit(LOGICAL_OP &) { \ - IncrementCost(CostParam::PARAM_NAME); \ - cardinality_ *= CardParam::PARAM_NAME; \ - return true; \ +namespace { +// converts an optional ScanAll range bound into a property value +// if the bound is present and is a literal expression convertible to +// a property value. 
otherwise returns nullopt +std::experimental::optional> BoundToPropertyValue( + std::experimental::optional bound) { + if (bound) + if (auto *literal = dynamic_cast(bound->value())) + return std::experimental::make_optional( + utils::Bound(literal->value_, bound->type())); + return std::experimental::nullopt; +} +} + +bool CostEstimator::PostVisit(ScanAllByLabelPropertyRange &logical_op) { + // this cardinality estimation depends on Bound expressions. + // if they are literals we can evaluate cardinality properly + auto lower = BoundToPropertyValue(logical_op.lower_bound()); + auto upper = BoundToPropertyValue(logical_op.upper_bound()); + + int64_t factor = 1; + if (upper || lower) + // if we have either Bound, use the value index + factor = db_accessor_.vertices_count(logical_op.label(), + logical_op.property(), lower, upper); + else + // no values, but we still have the label + factor = + db_accessor_.vertices_count(logical_op.label(), logical_op.property()); + + // if we failed to take either bound from the op into account, then apply + // the filtering constant to the factor + if ((logical_op.upper_bound() && !upper) || + (logical_op.lower_bound() && !lower)) + factor *= CardParam::kFilter; + + cardinality_ *= factor; + + // ScanAll performs some work for every element that is produced + IncrementCost(CostParam::MakeScanAllByLabelPropertyRange); + return true; +} + +// For the given op first increments the cardinality and then cost. 
+#define POST_VISIT_CARD_FIRST(NAME) \ + bool CostEstimator::PostVisit(NAME &) { \ + cardinality_ *= CardParam::k##NAME; \ + IncrementCost(CostParam::k##NAME); \ + return true; \ } -POST_VISIT(Filter, kFilter) -POST_VISIT(ExpandUniquenessFilter, kExpandUniquenessFilter); -POST_VISIT(ExpandUniquenessFilter, kExpandUniquenessFilter); +POST_VISIT_CARD_FIRST(Expand); +POST_VISIT_CARD_FIRST(ExpandVariable); +POST_VISIT_CARD_FIRST(ExpandBreadthFirst); -#undef POST_VISIT +#undef POST_VISIT_CARD_FIRST + +// For the given op first increments the cost and then cardinality. +#define POST_VISIT_COST_FIRST(LOGICAL_OP, PARAM_NAME) \ + bool CostEstimator::PostVisit(LOGICAL_OP &) { \ + IncrementCost(CostParam::PARAM_NAME); \ + cardinality_ *= CardParam::PARAM_NAME; \ + return true; \ + } + +POST_VISIT_COST_FIRST(Filter, kFilter) +POST_VISIT_COST_FIRST(ExpandUniquenessFilter, + kExpandUniquenessFilter); +POST_VISIT_COST_FIRST(ExpandUniquenessFilter, + kExpandUniquenessFilter); + +#undef POST_VISIT_COST_FIRST bool CostEstimator::PostVisit(Unwind &unwind) { // Unwind cost depends more on the number of lists that get unwound diff --git a/src/query/plan/cost_estimator.hpp b/src/query/plan/cost_estimator.hpp index 9f92ef750..d4733c7e2 100644 --- a/src/query/plan/cost_estimator.hpp +++ b/src/query/plan/cost_estimator.hpp @@ -5,36 +5,44 @@ namespace query::plan { /** - * @brief: Query plan execution time cost estimator, - * for comparing and choosing optimal execution plans. + * Query plan execution time cost estimator, for comparing and choosing optimal + * execution plans. * - * In Cypher the write part of the query always executes in - * the same cardinality. It is not allowed to execute a write - * operation before all the expansion for that query part - * (WITH splits a query into parts) have executed. + * In Cypher the write part of the query always executes in the same + * cardinality. 
It is not allowed to execute a write operation before all the + * expansion for that query part (WITH splits a query into parts) have executed. + * For that reason cost estimation comes down to cardinality estimation for the + * read parts of the query, and their expansion. We want to compare different + * plans and try to figure out which has the optimal organization of scans, + * expansions and filters. * - * Note that expansions and filtering can also happen during - * Merge, which is a write operation. We let that get evaluated - * like all other cardinality influencing ops. Also, Merge - * cardinality modification should be contained (it can never - * reduce it's input cardinality), but since Merge always happens - * after the read part, and can't be reoredered, we can ignore - * that. + * Note that expansions and filtering can also happen during Merge, which is a + * write operation. We let that get evaluated like all other cardinality + * influencing ops. Also, Merge cardinality modification should be contained (it + * can never reduce it's input cardinality), but since Merge always happens + * after the read part, and can't be reoredered, we can ignore that. * - * Limiting and accumulating (Aggregate, OrderBy, Accumulate) - * operations are cardinality - * modifiers that always execute at the end of the - * query part. Their cardinality influence is irrelevant - * because they generally execute the same for all plans - * for a single query part, and query part reordering is - * not allowed. + * Limiting and accumulating (Aggregate, OrderBy, Accumulate) operations are + * cardinality modifiers that always execute at the end of the query part. Their + * cardinality influence is irrelevant because they execute the same + * for all plans for a single query part, and query part reordering is not + * allowed. + * + * This kind of cost estimation can only be used for comparing logical plans. 
+ * Its aim is to estimate cost(A) to be less than cost(B) in every case where + * actual query execution for plan A is less than that of plan B. It can NOT be + * used to estimate how MUCH execution between A and B will differ. */ class CostEstimator : public HierarchicalLogicalOperatorVisitor { public: struct CostParam { static constexpr double kScanAll{1.0}; static constexpr double kScanAllByLabel{1.1}; + static constexpr double MakeScanAllByLabelPropertyValue{1.1}; + static constexpr double MakeScanAllByLabelPropertyRange{1.1}; static constexpr double kExpand{2.0}; + static constexpr double kExpandVariable{3.0}; + static constexpr double kExpandBreadthFirst{5.0}; static constexpr double kFilter{1.5}; static constexpr double kExpandUniquenessFilter{1.5}; static constexpr double kUnwind{1.3}; @@ -42,6 +50,8 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor { struct CardParam { static constexpr double kExpand{3.0}; + static constexpr double kExpandVariable{9.0}; + static constexpr double kExpandBreadthFirst{8.0}; static constexpr double kFilter{0.25}; static constexpr double kExpandUniquenessFilter{0.95}; }; @@ -58,7 +68,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor { bool PostVisit(ScanAll &) override; bool PostVisit(ScanAllByLabel &scan_all_by_label) override; + bool PostVisit(ScanAllByLabelPropertyValue &logical_op) override; + bool PostVisit(ScanAllByLabelPropertyRange &logical_op) override; bool PostVisit(Expand &) override; + bool PostVisit(ExpandVariable &) override; + bool PostVisit(ExpandBreadthFirst &) override; bool PostVisit(Filter &) override; bool PostVisit(ExpandUniquenessFilter &) override; bool PostVisit(ExpandUniquenessFilter &) override; diff --git a/src/query/typed_value.cpp b/src/query/typed_value.cpp index ef965cb0b..de8c7512b 100644 --- a/src/query/typed_value.cpp +++ b/src/query/typed_value.cpp @@ -265,6 +265,20 @@ bool TypedValue::IsNumeric() const { return type() == TypedValue::Type::Int || type() 
== TypedValue::Type::Double; } +bool TypedValue::IsPropertyValue() const { + switch (type()) { + case Type::Null: + case Type::Bool: + case Type::Int: + case Type::Double: + case Type::String: + case Type::List: + return true; + default: + return false; + } +} + std::ostream &operator<<(std::ostream &os, const TypedValue::Type type) { switch (type) { case TypedValue::Type::Null: diff --git a/src/query/typed_value.hpp b/src/query/typed_value.hpp index dc1b1e942..124ebc719 100644 --- a/src/query/typed_value.hpp +++ b/src/query/typed_value.hpp @@ -140,6 +140,10 @@ class TypedValue : public TotalOrdering { * an integer or double */ bool IsNumeric() const; + /** Convenience function for checking if this TypedValue can be converted into + * PropertyValue */ + bool IsPropertyValue() const; + friend std::ostream &operator<<(std::ostream &stream, const TypedValue &prop); private: diff --git a/tests/unit/query_cost_estimator.cpp b/tests/unit/query_cost_estimator.cpp index fd96a7b0f..00e90c82e 100644 --- a/tests/unit/query_cost_estimator.cpp +++ b/tests/unit/query_cost_estimator.cpp @@ -24,6 +24,8 @@ class QueryCostEstimator : public ::testing::Test { protected: Dbms dbms; std::unique_ptr dba = dbms.active(); + GraphDbTypes::Label label = dba->label("label"); + GraphDbTypes::Property property = dba->property("property"); // we incrementally build the logical operator plan // start it off with Once @@ -33,18 +35,26 @@ class QueryCostEstimator : public ::testing::Test { SymbolTable symbol_table_; int symbol_count = 0; + void SetUp() { + // create the index in the current db accessor and then swap it to a new one + dba->BuildIndex(label, property); + auto new_dba = dbms.active(); + dba.swap(new_dba); + } + Symbol NextSymbol() { return symbol_table_.CreateSymbol("Symbol" + std::to_string(symbol_count++), true); } - /** Adds the given number of vertices to the DB, which - * the given number is labeled with the given label */ - void AddVertices(int vertex_count, GraphDbTypes::Label 
label, - int labeled_count) { + /** Adds the given number of vertices to the DB, of which + * the given numbers are labeled and have a property set. */ + void AddVertices(int vertex_count, int labeled_count, + int property_count = 0) { for (int i = 0; i < vertex_count; i++) { auto vertex = dba->insert_vertex(); if (i < labeled_count) vertex.add_label(label); + if (i < property_count) vertex.PropsSet(property, i); } dba->advance_command(); @@ -60,6 +70,18 @@ class QueryCostEstimator : public ::testing::Test { void MakeOp(TArgs... args) { last_op_ = std::make_shared(args...); } + + template + Expression *Literal(TValue value) { + return storage_.Create(value); + } + + auto InclusiveBound(int bound) { + return std::experimental::make_optional( + utils::MakeBoundInclusive(Literal(bound))); + }; + + const std::experimental::nullopt_t nullopt = std::experimental::nullopt; }; // multiply with 1 to avoid linker error (possibly fixed in CLang >= 3.81) @@ -68,27 +90,84 @@ class QueryCostEstimator : public ::testing::Test { TEST_F(QueryCostEstimator, Once) { EXPECT_COST(0); } TEST_F(QueryCostEstimator, ScanAll) { - AddVertices(100, dba->label("Label"), 30); + AddVertices(100, 30, 20); MakeOp(last_op_, NextSymbol()); EXPECT_COST(100 * CostParam::kScanAll); } TEST_F(QueryCostEstimator, ScanAllByLabelCardinality) { - GraphDbTypes::Label label = dba->label("Label"); - AddVertices(100, label, 30); + AddVertices(100, 30, 20); MakeOp(last_op_, NextSymbol(), label); EXPECT_COST(30 * CostParam::kScanAllByLabel); } -TEST_F(QueryCostEstimator, ExpandCardinality) { +TEST_F(QueryCostEstimator, ScanAllByLabelPropertyValueLiteral) { + AddVertices(100, 30, 20); + MakeOp(last_op_, NextSymbol(), label, property, + Literal(12)); + EXPECT_COST(1 * CostParam::MakeScanAllByLabelPropertyValue); +} + +TEST_F(QueryCostEstimator, ScanAllByLabelPropertyValueExpr) { + AddVertices(100, 30, 20); + MakeOp( + last_op_, NextSymbol(), label, property, + // once we make expression const-folding this test case 
will fail + storage_.Create(Literal(12))); + EXPECT_COST(20 * CardParam::kFilter * + CostParam::MakeScanAllByLabelPropertyValue); +} + +TEST_F(QueryCostEstimator, ScanAllByLabelPropertyRangeUpper) { + AddVertices(100, 30, 20); + MakeOp(last_op_, NextSymbol(), label, property, + nullopt, InclusiveBound(12)); + // cardinality estimation is exact for very small indexes + EXPECT_COST(13 * CostParam::MakeScanAllByLabelPropertyRange); +} + +TEST_F(QueryCostEstimator, ScanAllByLabelPropertyRangeLower) { + AddVertices(100, 30, 20); + MakeOp(last_op_, NextSymbol(), label, property, + InclusiveBound(17), nullopt); + // cardinality estimation is exact for very small indexes + EXPECT_COST(3 * CostParam::MakeScanAllByLabelPropertyRange); +} + +TEST_F(QueryCostEstimator, ScanAllByLabelPropertyRangeNonLiteral) { + AddVertices(100, 30, 20); + auto bound = std::experimental::make_optional( + utils::MakeBoundInclusive(static_cast( + storage_.Create(Literal(12))))); + MakeOp(last_op_, NextSymbol(), label, property, + bound, nullopt); + EXPECT_COST(20 * CardParam::kFilter * + CostParam::MakeScanAllByLabelPropertyRange); +} + +TEST_F(QueryCostEstimator, Expand) { MakeOp(NextSymbol(), NextSymbol(), EdgeAtom::Direction::IN, last_op_, NextSymbol(), false, false); EXPECT_COST(CardParam::kExpand * CostParam::kExpand); } -// helper for testing an operations cost and cardinality -// only for operations that first increment cost, then modify cardinality -// intentially a macro (instead of function) for better test feedback +TEST_F(QueryCostEstimator, ExpandVariable) { + MakeOp(NextSymbol(), NextSymbol(), EdgeAtom::Direction::IN, + nullptr, nullptr, last_op_, NextSymbol(), false, + false); + EXPECT_COST(CardParam::kExpandVariable * CostParam::kExpandVariable); +} + +TEST_F(QueryCostEstimator, ExpandBreadthFirst) { + MakeOp( + NextSymbol(), NextSymbol(), EdgeAtom::Direction::IN, Literal(3), + NextSymbol(), NextSymbol(), Literal(true), last_op_, NextSymbol(), false); + 
EXPECT_COST(CardParam::kExpandBreadthFirst * CostParam::kExpandBreadthFirst); +} + +// Helper for testing an operation's cost and cardinality. +// Only for operations that first increment cost, then modify cardinality. +// Intentionally a macro (instead of function) for better test feedback. #define TEST_OP(OP, OP_COST_PARAM, OP_CARD_PARAM) \ OP; \ EXPECT_COST(OP_COST_PARAM); \ @@ -96,8 +175,8 @@ TEST_F(QueryCostEstimator, ExpandCardinality) { EXPECT_COST(OP_COST_PARAM + OP_CARD_PARAM * OP_COST_PARAM); TEST_F(QueryCostEstimator, Filter) { - TEST_OP(MakeOp(last_op_, storage_.Create(true)), - CostParam::kFilter, CardParam::kFilter); + TEST_OP(MakeOp(last_op_, Literal(true)), CostParam::kFilter, + CardParam::kFilter); } TEST_F(QueryCostEstimator, ExpandUniquenessFilter) {