CostEstimator - adding new ScanAll and Expand ops

Summary:
Now all ScanAll and Expand ops are covered by the cost estimator. For ScanAll with indices, cost estimation is pretty good; for the new Expand ops it is tragically bad (Expand to the power of expansion depth, plus arbitrary filtering). Static cost estimation is wrong wrong wrong.

Currently, cost estimation of even trivial plans that use indices is wrong because the planner leaves filtering expressions that are implicitly handled by the index in the operator tree, IIRC. Tasking Teon to revise this, even though I'm not sure how badly this influences cost estimation and its use in plan choosing.

Reviewers: mislav.bradac, teon.banek, buda

Reviewed By: mislav.bradac

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D633
This commit is contained in:
florijan 2017-08-03 14:18:19 +02:00
parent 6c22caa80e
commit bce4407ebe
5 changed files with 242 additions and 48 deletions

View File

@ -1,3 +1,5 @@
#include <experimental/optional>
#include "cost_estimator.hpp"
namespace query::plan {
@ -11,31 +13,112 @@ bool CostEstimator::PostVisit(ScanAll &) {
bool CostEstimator::PostVisit(ScanAllByLabel &scan_all_by_label) {
  // Scale the running cardinality by the number of vertices the accessor
  // reports for the scanned label.
  const auto labeled_vertex_count =
      db_accessor_.vertices_count(scan_all_by_label.label());
  cardinality_ *= labeled_vertex_count;

  // A scan performs some work for every element that is produced.
  IncrementCost(CostParam::kScanAllByLabel);
  return true;
}
bool CostEstimator::PostVisit(Expand &) {
cardinality_ *= CardParam::kExpand;
// Expand performs some work for every expansion
IncrementCost(CostParam::kExpand);
bool CostEstimator::PostVisit(ScanAllByLabelPropertyValue &logical_op) {
  // The looked-up value is only known up front when the operator's expression
  // is a literal holding something convertible to a PropertyValue. In every
  // other case the optional stays empty and the cardinality is estimated.
  std::experimental::optional<PropertyValue> known_value;
  auto *as_literal = dynamic_cast<PrimitiveLiteral *>(logical_op.expression());
  if (as_literal != nullptr && as_literal->value_.IsPropertyValue()) {
    known_value =
        std::experimental::optional<PropertyValue>(as_literal->value_);
  }

  double multiplier = 1.0;
  if (!known_value) {
    // Unknown value: take the (label, property) count and scale it by the
    // generic filtering constant.
    multiplier =
        db_accessor_.vertices_count(logical_op.label(), logical_op.property()) *
        CardParam::kFilter;
  } else {
    // Known value: ask for the count of (label, property, value) directly.
    multiplier = db_accessor_.vertices_count(
        logical_op.label(), logical_op.property(), known_value.value());
  }
  cardinality_ *= multiplier;

  // A scan performs some work for every element that is produced.
  IncrementCost(CostParam::MakeScanAllByLabelPropertyValue);
  return true;
}
// for the given op first increments the cost and then cardinality
#define POST_VISIT(LOGICAL_OP, PARAM_NAME) \
bool CostEstimator::PostVisit(LOGICAL_OP &) { \
IncrementCost(CostParam::PARAM_NAME); \
cardinality_ *= CardParam::PARAM_NAME; \
return true; \
namespace {

// Converts an optional ScanAll range bound into a bound over a PropertyValue.
//
// A value is returned only when the bound is present AND its expression is a
// literal whose value can be converted into a PropertyValue. In every other
// case nullopt is returned, which tells the caller that this bound could not
// be taken into account exactly.
std::experimental::optional<utils::Bound<PropertyValue>> BoundToPropertyValue(
    std::experimental::optional<ScanAllByLabelPropertyRange::Bound> bound) {
  if (!bound) return std::experimental::nullopt;

  auto *literal = dynamic_cast<PrimitiveLiteral *>(bound->value());
  // Guard the TypedValue -> PropertyValue conversion with IsPropertyValue(),
  // the same check the ScanAllByLabelPropertyValue estimation performs before
  // converting its literal.
  if (literal == nullptr || !literal->value_.IsPropertyValue())
    return std::experimental::nullopt;

  return std::experimental::make_optional(
      utils::Bound<PropertyValue>(literal->value_, bound->type()));
}

}  // namespace
bool CostEstimator::PostVisit(ScanAllByLabelPropertyRange &logical_op) {
  // Cardinality estimation depends on the Bound expressions: bounds that are
  // literals can be passed to the accessor, the remaining ones are estimated.
  auto lower = BoundToPropertyValue(logical_op.lower_bound());
  auto upper = BoundToPropertyValue(logical_op.upper_bound());

  // Kept as a double on purpose: CardParam::kFilter is fractional, so an
  // integral factor would be truncated (possibly down to zero, wiping out the
  // whole estimate) when the filtering constant is applied below.
  double factor = 1.0;
  if (upper || lower)
    // If we have either Bound<PropertyValue>, use the value index.
    factor = db_accessor_.vertices_count(logical_op.label(),
                                         logical_op.property(), lower, upper);
  else
    // No usable values, but we still have the label and the property.
    factor =
        db_accessor_.vertices_count(logical_op.label(), logical_op.property());

  // If we failed to take either bound from the op into account, then apply
  // the filtering constant to the factor.
  if ((logical_op.upper_bound() && !upper) ||
      (logical_op.lower_bound() && !lower))
    factor *= CardParam::kFilter;

  cardinality_ *= factor;

  // A scan performs some work for every element that is produced.
  IncrementCost(CostParam::MakeScanAllByLabelPropertyRange);
  return true;
}
// Defines CostEstimator::PostVisit(NAME &) for an operator whose cardinality
// and cost constants are both called k<NAME>. The generated body first scales
// cardinality_ by CardParam::k<NAME> and only then calls IncrementCost with
// CostParam::k<NAME>.
// NOTE(review): presumably IncrementCost reads cardinality_, which would make
// this ordering significant -- confirm against its definition.
#define POST_VISIT_CARD_FIRST(NAME) \
bool CostEstimator::PostVisit(NAME &) { \
cardinality_ *= CardParam::k##NAME; \
IncrementCost(CostParam::k##NAME); \
return true; \
}
POST_VISIT(Filter, kFilter)
POST_VISIT(ExpandUniquenessFilter<VertexAccessor>, kExpandUniquenessFilter);
POST_VISIT(ExpandUniquenessFilter<EdgeAccessor>, kExpandUniquenessFilter);
POST_VISIT_CARD_FIRST(Expand);
POST_VISIT_CARD_FIRST(ExpandVariable);
POST_VISIT_CARD_FIRST(ExpandBreadthFirst);
// The helper macros are undefined again right after their last use.
#undef POST_VISIT
#undef POST_VISIT_CARD_FIRST
// Defines CostEstimator::PostVisit(LOGICAL_OP &) for an operator whose cost
// and cardinality constants share the name PARAM_NAME. The generated body
// first calls IncrementCost with CostParam::PARAM_NAME and only then scales
// cardinality_ by CardParam::PARAM_NAME.
// NOTE(review): presumably IncrementCost reads cardinality_, which would make
// this ordering significant -- confirm against its definition.
#define POST_VISIT_COST_FIRST(LOGICAL_OP, PARAM_NAME) \
bool CostEstimator::PostVisit(LOGICAL_OP &) { \
IncrementCost(CostParam::PARAM_NAME); \
cardinality_ *= CardParam::PARAM_NAME; \
return true; \
}
// Instantiated for Filter and for both ExpandUniquenessFilter specializations.
POST_VISIT_COST_FIRST(Filter, kFilter)
POST_VISIT_COST_FIRST(ExpandUniquenessFilter<VertexAccessor>,
kExpandUniquenessFilter);
POST_VISIT_COST_FIRST(ExpandUniquenessFilter<EdgeAccessor>,
kExpandUniquenessFilter);
// The helper macro is undefined again right after its last use.
#undef POST_VISIT_COST_FIRST
bool CostEstimator::PostVisit(Unwind &unwind) {
// Unwind cost depends more on the number of lists that get unwound

View File

@ -5,36 +5,44 @@
namespace query::plan {
/**
* @brief: Query plan execution time cost estimator,
* for comparing and choosing optimal execution plans.
* Query plan execution time cost estimator, for comparing and choosing optimal
* execution plans.
*
* In Cypher the write part of the query always executes in
* the same cardinality. It is not allowed to execute a write
* operation before all the expansion for that query part
* (WITH splits a query into parts) have executed.
* In Cypher the write part of the query always executes in the same
* cardinality. It is not allowed to execute a write operation before all the
* expansion for that query part (WITH splits a query into parts) have executed.
* For that reason cost estimation comes down to cardinality estimation for the
* read parts of the query, and their expansion. We want to compare different
* plans and try to figure out which has the optimal organization of scans,
* expansions and filters.
*
* Note that expansions and filtering can also happen during
* Merge, which is a write operation. We let that get evaluated
* like all other cardinality influencing ops. Also, Merge
* cardinality modification should be contained (it can never
* reduce it's input cardinality), but since Merge always happens
* after the read part, and can't be reoredered, we can ignore
* that.
* Note that expansions and filtering can also happen during Merge, which is a
* write operation. We let that get evaluated like all other cardinality
* influencing ops. Also, Merge cardinality modification should be contained (it
* can never reduce its input cardinality), but since Merge always happens
* after the read part, and can't be reordered, we can ignore that.
*
* Limiting and accumulating (Aggregate, OrderBy, Accumulate)
* operations are cardinality
* modifiers that always execute at the end of the
* query part. Their cardinality influence is irrelevant
* because they generally execute the same for all plans
* for a single query part, and query part reordering is
* not allowed.
* Limiting and accumulating (Aggregate, OrderBy, Accumulate) operations are
* cardinality modifiers that always execute at the end of the query part. Their
* cardinality influence is irrelevant because they execute the same
* for all plans for a single query part, and query part reordering is not
* allowed.
*
* This kind of cost estimation can only be used for comparing logical plans.
* Its aim is to estimate cost(A) to be less than cost(B) in every case where
* actual query execution for plan A is less than that of plan B. It can NOT be
* used to estimate how MUCH execution between A and B will differ.
*/
class CostEstimator : public HierarchicalLogicalOperatorVisitor {
public:
struct CostParam {
static constexpr double kScanAll{1.0};
static constexpr double kScanAllByLabel{1.1};
static constexpr double MakeScanAllByLabelPropertyValue{1.1};
static constexpr double MakeScanAllByLabelPropertyRange{1.1};
static constexpr double kExpand{2.0};
static constexpr double kExpandVariable{3.0};
static constexpr double kExpandBreadthFirst{5.0};
static constexpr double kFilter{1.5};
static constexpr double kExpandUniquenessFilter{1.5};
static constexpr double kUnwind{1.3};
@ -42,6 +50,8 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
// Per-operator factors by which the estimator multiplies its running
// cardinality estimate.
struct CardParam {
  static constexpr double kExpand = 3.0;
  static constexpr double kExpandVariable = 9.0;
  static constexpr double kExpandBreadthFirst = 8.0;
  static constexpr double kFilter = 0.25;
  static constexpr double kExpandUniquenessFilter = 0.95;
};
@ -58,7 +68,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {
bool PostVisit(ScanAll &) override;
bool PostVisit(ScanAllByLabel &scan_all_by_label) override;
bool PostVisit(ScanAllByLabelPropertyValue &logical_op) override;
bool PostVisit(ScanAllByLabelPropertyRange &logical_op) override;
bool PostVisit(Expand &) override;
bool PostVisit(ExpandVariable &) override;
bool PostVisit(ExpandBreadthFirst &) override;
bool PostVisit(Filter &) override;
bool PostVisit(ExpandUniquenessFilter<VertexAccessor> &) override;
bool PostVisit(ExpandUniquenessFilter<EdgeAccessor> &) override;

View File

@ -265,6 +265,20 @@ bool TypedValue::IsNumeric() const {
return type() == TypedValue::Type::Int || type() == TypedValue::Type::Double;
}
bool TypedValue::IsPropertyValue() const {
  // Null, Bool, Int, Double, String and List are reported as convertible;
  // every other type is not.
  const auto held = type();
  return held == Type::Null || held == Type::Bool || held == Type::Int ||
         held == Type::Double || held == Type::String || held == Type::List;
}
std::ostream &operator<<(std::ostream &os, const TypedValue::Type type) {
switch (type) {
case TypedValue::Type::Null:

View File

@ -140,6 +140,10 @@ class TypedValue : public TotalOrdering<TypedValue, TypedValue, TypedValue> {
* an integer or double */
bool IsNumeric() const;
/** Convenience function for checking if this TypedValue can be converted into
* PropertyValue */
bool IsPropertyValue() const;
friend std::ostream &operator<<(std::ostream &stream, const TypedValue &prop);
private:

View File

@ -24,6 +24,8 @@ class QueryCostEstimator : public ::testing::Test {
protected:
Dbms dbms;
std::unique_ptr<GraphDbAccessor> dba = dbms.active();
GraphDbTypes::Label label = dba->label("label");
GraphDbTypes::Property property = dba->property("property");
// we incrementally build the logical operator plan
// start it off with Once
@ -33,18 +35,26 @@ class QueryCostEstimator : public ::testing::Test {
SymbolTable symbol_table_;
int symbol_count = 0;
void SetUp() {
  // Build the (label, property) index through the current accessor, then
  // exchange that accessor for a freshly obtained one.
  dba->BuildIndex(label, property);
  auto fresh_accessor = dbms.active();
  dba.swap(fresh_accessor);
}
Symbol NextSymbol() {
  // Each call yields a symbol named "Symbol<N>", where N is the running
  // counter, which is bumped afterwards.
  const auto symbol_name = "Symbol" + std::to_string(symbol_count++);
  return symbol_table_.CreateSymbol(symbol_name, true);
}
/** Adds the given number of vertices to the DB, which
* the given number is labeled with the given label */
void AddVertices(int vertex_count, GraphDbTypes::Label label,
int labeled_count) {
/** Adds the given number of vertices to the DB, of which
* the given numbers are labeled and have a property set. */
void AddVertices(int vertex_count, int labeled_count,
int property_count = 0) {
for (int i = 0; i < vertex_count; i++) {
auto vertex = dba->insert_vertex();
if (i < labeled_count) vertex.add_label(label);
if (i < property_count) vertex.PropsSet(property, i);
}
dba->advance_command();
@ -60,6 +70,18 @@ class QueryCostEstimator : public ::testing::Test {
void MakeOp(TArgs... args) {
last_op_ = std::make_shared<TLogicalOperator>(args...);
}
// Creates a PrimitiveLiteral holding the given value in the fixture's AST
// storage and hands it back as a generic Expression pointer.
template <typename TValue>
Expression *Literal(TValue value) {
  auto *literal_node = storage_.Create<PrimitiveLiteral>(value);
  return literal_node;
}
// Wraps the given integer in a literal expression, makes an inclusive bound
// out of it and returns that bound inside an optional.
auto InclusiveBound(int bound) {
  auto inclusive = utils::MakeBoundInclusive(Literal(bound));
  return std::experimental::make_optional(inclusive);
}
const std::experimental::nullopt_t nullopt = std::experimental::nullopt;
};
// multiply with 1 to avoid linker error (possibly fixed in CLang >= 3.81)
@ -68,27 +90,84 @@ class QueryCostEstimator : public ::testing::Test {
TEST_F(QueryCostEstimator, Once) { EXPECT_COST(0); }
TEST_F(QueryCostEstimator, ScanAll) {
AddVertices(100, dba->label("Label"), 30);
AddVertices(100, 30, 20);
MakeOp<ScanAll>(last_op_, NextSymbol());
EXPECT_COST(100 * CostParam::kScanAll);
}
TEST_F(QueryCostEstimator, ScanAllByLabelCardinality) {
GraphDbTypes::Label label = dba->label("Label");
AddVertices(100, label, 30);
AddVertices(100, 30, 20);
MakeOp<ScanAllByLabel>(last_op_, NextSymbol(), label);
EXPECT_COST(30 * CostParam::kScanAllByLabel);
}
TEST_F(QueryCostEstimator, ExpandCardinality) {
// Lookup by a literal value. AddVertices(100, 30, 20) gives the first 20
// vertices both the label and the property values 0..19, so exactly one
// vertex holds the value 12 and the expected cardinality is 1.
TEST_F(QueryCostEstimator, ScanAllByLabelPropertyValueLiteral) {
AddVertices(100, 30, 20);
MakeOp<ScanAllByLabelPropertyValue>(last_op_, NextSymbol(), label, property,
Literal(12));
EXPECT_COST(1 * CostParam::MakeScanAllByLabelPropertyValue);
}
// Lookup by a non-literal expression. The value is not known up front, so the
// expected cardinality is the number of vertices with the label and the
// property (20) scaled by the filtering constant.
TEST_F(QueryCostEstimator, ScanAllByLabelPropertyValueExpr) {
AddVertices(100, 30, 20);
MakeOp<ScanAllByLabelPropertyValue>(
last_op_, NextSymbol(), label, property,
// Once expression const-folding is implemented this test case will fail,
// because the unary plus over a literal would then fold into a literal.
storage_.Create<UnaryPlusOperator>(Literal(12)));
EXPECT_COST(20 * CardParam::kFilter *
CostParam::MakeScanAllByLabelPropertyValue);
}
// Range scan with only an inclusive literal upper bound of 12. The indexed
// property values are 0..19, so the values 0..12 match: 13 vertices.
TEST_F(QueryCostEstimator, ScanAllByLabelPropertyRangeUpper) {
AddVertices(100, 30, 20);
MakeOp<ScanAllByLabelPropertyRange>(last_op_, NextSymbol(), label, property,
nullopt, InclusiveBound(12));
// cardinality estimation is exact for very small indexes
EXPECT_COST(13 * CostParam::MakeScanAllByLabelPropertyRange);
}
// Range scan with only an inclusive literal lower bound of 17. The indexed
// property values are 0..19, so the values 17, 18 and 19 match: 3 vertices.
TEST_F(QueryCostEstimator, ScanAllByLabelPropertyRangeLower) {
AddVertices(100, 30, 20);
MakeOp<ScanAllByLabelPropertyRange>(last_op_, NextSymbol(), label, property,
InclusiveBound(17), nullopt);
// cardinality estimation is exact for very small indexes
EXPECT_COST(3 * CostParam::MakeScanAllByLabelPropertyRange);
}
// Range scan whose only (lower) bound is a non-literal expression. The bound
// cannot be turned into a property value, so the expected cardinality is the
// number of vertices with the label and the property (20) scaled by the
// filtering constant.
TEST_F(QueryCostEstimator, ScanAllByLabelPropertyRangeNonLiteral) {
AddVertices(100, 30, 20);
// The explicit cast to Expression * makes the bound's value type the generic
// expression pointer rather than UnaryPlusOperator *.
auto bound = std::experimental::make_optional(
utils::MakeBoundInclusive(static_cast<Expression *>(
storage_.Create<UnaryPlusOperator>(Literal(12)))));
MakeOp<ScanAllByLabelPropertyRange>(last_op_, NextSymbol(), label, property,
bound, nullopt);
EXPECT_COST(20 * CardParam::kFilter *
CostParam::MakeScanAllByLabelPropertyRange);
}
// A single Expand placed on top of the fixture's initial plan. The expected
// cost is the Expand cardinality factor times the Expand cost constant.
// NOTE(review): this presumably relies on the estimator starting from a
// cardinality of 1 -- confirm.
TEST_F(QueryCostEstimator, Expand) {
MakeOp<Expand>(NextSymbol(), NextSymbol(), EdgeAtom::Direction::IN, last_op_,
NextSymbol(), false, false);
EXPECT_COST(CardParam::kExpand * CostParam::kExpand);
}
// helper for testing an operations cost and cardinality
// only for operations that first increment cost, then modify cardinality
// intentially a macro (instead of function) for better test feedback
// A single ExpandVariable placed on top of the fixture's initial plan. The
// expected cost is the ExpandVariable cardinality factor times its cost
// constant.
// NOTE(review): the two nullptr arguments are presumably the optional depth
// bounds -- confirm against the ExpandVariable constructor.
TEST_F(QueryCostEstimator, ExpandVariable) {
MakeOp<ExpandVariable>(NextSymbol(), NextSymbol(), EdgeAtom::Direction::IN,
nullptr, nullptr, last_op_, NextSymbol(), false,
false);
EXPECT_COST(CardParam::kExpandVariable * CostParam::kExpandVariable);
}
// A single ExpandBreadthFirst placed on top of the fixture's initial plan.
// The expected cost is the ExpandBreadthFirst cardinality factor times its
// cost constant.
// NOTE(review): Literal(3) and Literal(true) are presumably the max depth and
// the filter expression -- confirm against the ExpandBreadthFirst constructor.
TEST_F(QueryCostEstimator, ExpandBreadthFirst) {
MakeOp<ExpandBreadthFirst>(
NextSymbol(), NextSymbol(), EdgeAtom::Direction::IN, Literal(3),
NextSymbol(), NextSymbol(), Literal(true), last_op_, NextSymbol(), false);
EXPECT_COST(CardParam::kExpandBreadthFirst * CostParam::kExpandBreadthFirst);
}
// Helper for testing an operations cost and cardinality.
// Only for operations that first increment cost, then modify cardinality.
// Intentially a macro (instead of function) for better test feedback.
#define TEST_OP(OP, OP_COST_PARAM, OP_CARD_PARAM) \
OP; \
EXPECT_COST(OP_COST_PARAM); \
@ -96,8 +175,8 @@ TEST_F(QueryCostEstimator, ExpandCardinality) {
EXPECT_COST(OP_COST_PARAM + OP_CARD_PARAM * OP_COST_PARAM);
TEST_F(QueryCostEstimator, Filter) {
TEST_OP(MakeOp<Filter>(last_op_, storage_.Create<PrimitiveLiteral>(true)),
CostParam::kFilter, CardParam::kFilter);
TEST_OP(MakeOp<Filter>(last_op_, Literal(true)), CostParam::kFilter,
CardParam::kFilter);
}
TEST_F(QueryCostEstimator, ExpandUniquenessFilter) {