memgraph/src/query/plan/cost_estimator.hpp
Teon Banek 1fd9a72e10 Generate Load functions from LCP as top level
Summary: Depends on D1596

Reviewers: mtomic, msantl

Reviewed By: msantl

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D1601
2018-09-28 10:34:20 +02:00

253 lines
9.7 KiB
C++

#pragma once
#include "query/frontend/ast/ast.hpp"
#include "query/parameters.hpp"
#include "query/plan/operator.hpp"
#include "query/typed_value.hpp"
namespace query::plan {
/**
* Query plan execution time cost estimator, for comparing and choosing optimal
* execution plans.
*
* In Cypher the write part of the query always executes in the same
* cardinality. It is not allowed to execute a write operation before all the
* expansions for that query part (WITH splits a query into parts) have executed.
* For that reason cost estimation comes down to cardinality estimation for the
* read parts of the query, and their expansion. We want to compare different
* plans and try to figure out which has the optimal organization of scans,
* expansions and filters.
*
* Note that expansions and filtering can also happen during Merge, which is a
* write operation. We let that get evaluated like all other cardinality
* influencing ops. Also, Merge cardinality modification should be contained (it
* can never reduce its input cardinality), but since Merge always happens
* after the read part, and can't be reordered, we can ignore that.
*
* Limiting and accumulating (Aggregate, OrderBy, Accumulate) operations are
* cardinality modifiers that always execute at the end of the query part. Their
* cardinality influence is irrelevant because they execute the same
* for all plans for a single query part, and query part reordering is not
* allowed.
*
* This kind of cost estimation can only be used for comparing logical plans.
* Its aim is to estimate cost(A) to be less than cost(B) in every case where
* actual query execution for plan A is less than that of plan B. It can NOT be
* used to estimate how MUCH execution between A and B will differ.
*/
template <class TDbAccessor>
class CostEstimator : public HierarchicalLogicalOperatorVisitor {
 public:
  /**
   * Per-operator cost weights. Each weight is multiplied by the current
   * cardinality estimate when it is added to the total cost (see
   * IncrementCost).
   */
  struct CostParam {
    static constexpr double kScanAll{1.0};
    static constexpr double kScanAllByLabel{1.1};
    // NOTE: these two do not follow the `k` prefix convention; the names are
    // kept as they are because they are part of the public interface.
    static constexpr double MakeScanAllByLabelPropertyValue{1.1};
    static constexpr double MakeScanAllByLabelPropertyRange{1.1};
    static constexpr double kExpand{2.0};
    static constexpr double kExpandVariable{3.0};
    static constexpr double kFilter{1.5};
    static constexpr double kExpandUniquenessFilter{1.5};
    static constexpr double kUnwind{1.3};
  };

  /**
   * Per-operator cardinality factors. The current cardinality estimate is
   * multiplied by the factor of each visited operator.
   */
  struct CardParam {
    static constexpr double kExpand{3.0};
    static constexpr double kExpandVariable{9.0};
    static constexpr double kFilter{0.25};
    static constexpr double kExpandUniquenessFilter{0.95};
  };

  /** Parameters that are neither a cost weight nor a cardinality factor. */
  struct MiscParam {
    // Number of elements assumed to be produced by an Unwind whose input
    // expression is not a list literal.
    static constexpr double kUnwindNoLiteral{10.0};
  };

  using HierarchicalLogicalOperatorVisitor::PostVisit;
  using HierarchicalLogicalOperatorVisitor::PreVisit;

  /**
   * Creates an estimator with zero cost and a cardinality of one.
   *
   * Both arguments are stored by reference, so they need to outlive the
   * estimator.
   *
   * @param db_accessor Source of vertex counts. It needs to provide
   *     `VerticesCount` callable with no arguments, with a label, with a label
   *     and a property, with a label, a property and a property value, and
   *     with a label, a property and optional lower and upper bounds.
   * @param parameters Query parameters, used for resolving parameter lookups
   *     into values.
   */
  CostEstimator(const TDbAccessor &db_accessor, const Parameters &parameters)
      : db_accessor_(db_accessor), parameters_(parameters) {}

  /** Scales the cardinality by the total vertex count, then adds the cost. */
  bool PostVisit(ScanAll &) override {
    cardinality_ *= db_accessor_.VerticesCount();
    // ScanAll performs some work for every element that is produced
    IncrementCost(CostParam::kScanAll);
    return true;
  }

  /**
   * Scales the cardinality by the vertex count for the scanned label, then
   * adds the cost.
   */
  bool PostVisit(ScanAllByLabel &scan_all_by_label) override {
    cardinality_ *= db_accessor_.VerticesCount(scan_all_by_label.label_);
    // ScanAll performs some work for every element that is produced
    IncrementCost(CostParam::kScanAllByLabel);
    return true;
  }

  /**
   * Scales the cardinality by the vertex count for the scanned label, property
   * and value, then adds the cost.
   *
   * The estimation depends on the property value expression. If the value can
   * be determined (see ConstPropertyValue), the count for that exact value is
   * used. Otherwise, the count for the (label, property) pair is used and
   * scaled by the filtering factor.
   */
  bool PostVisit(ScanAllByLabelPropertyValue &logical_op) override {
    auto property_value = ConstPropertyValue(logical_op.expression_);
    double factor = 1.0;
    if (property_value) {
      // get the exact influence based on ScanAll(label, property, value)
      factor = db_accessor_.VerticesCount(
          logical_op.label_, logical_op.property_, property_value.value());
    } else {
      // estimate the influence as ScanAll(label, property) * filtering
      factor = db_accessor_.VerticesCount(logical_op.label_,
                                          logical_op.property_) *
               CardParam::kFilter;
    }
    cardinality_ *= factor;
    // ScanAll performs some work for every element that is produced
    IncrementCost(CostParam::MakeScanAllByLabelPropertyValue);
    return true;
  }

  /**
   * Scales the cardinality by the vertex count for the scanned label, property
   * and range, then adds the cost.
   *
   * The estimation depends on the bound expressions. Bounds whose value can be
   * determined (see BoundToPropertyValue) are passed on to the vertex count.
   * If the operator has a bound which could not be taken into account, the
   * count is additionally scaled by the filtering factor.
   */
  bool PostVisit(ScanAllByLabelPropertyRange &logical_op) override {
    auto lower = BoundToPropertyValue(logical_op.lower_bound_);
    auto upper = BoundToPropertyValue(logical_op.upper_bound_);
    // The factor has to be a floating point number. With an integral type,
    // multiplying by the fractional kFilter truncates the result, so small
    // counts would drop to 0 and zero out the cardinality (and therefore the
    // cost) of everything that follows.
    double factor = 1.0;
    if (upper || lower) {
      // if we have either Bound<PropertyValue>, use the value index
      factor = db_accessor_.VerticesCount(logical_op.label_,
                                          logical_op.property_, lower, upper);
    } else {
      // no values, but we still have the label
      factor =
          db_accessor_.VerticesCount(logical_op.label_, logical_op.property_);
    }
    // if we failed to take either bound from the op into account, then apply
    // the filtering constant to the factor
    if ((logical_op.upper_bound_ && !upper) ||
        (logical_op.lower_bound_ && !lower)) {
      factor *= CardParam::kFilter;
    }
    cardinality_ *= factor;
    // ScanAll performs some work for every element that is produced
    IncrementCost(CostParam::MakeScanAllByLabelPropertyRange);
    return true;
  }

  // For the given op first increments the cardinality and then cost.
#define POST_VISIT_CARD_FIRST(NAME)     \
  bool PostVisit(NAME &) override {     \
    cardinality_ *= CardParam::k##NAME; \
    IncrementCost(CostParam::k##NAME);  \
    return true;                        \
  }

  POST_VISIT_CARD_FIRST(Expand);
  POST_VISIT_CARD_FIRST(ExpandVariable);

#undef POST_VISIT_CARD_FIRST

  // For the given op first increments the cost and then cardinality.
#define POST_VISIT_COST_FIRST(LOGICAL_OP, PARAM_NAME) \
  bool PostVisit(LOGICAL_OP &) override {             \
    IncrementCost(CostParam::PARAM_NAME);             \
    cardinality_ *= CardParam::PARAM_NAME;            \
    return true;                                      \
  }

  POST_VISIT_COST_FIRST(Filter, kFilter)
  POST_VISIT_COST_FIRST(ExpandUniquenessFilter<VertexAccessor>,
                        kExpandUniquenessFilter);
  POST_VISIT_COST_FIRST(ExpandUniquenessFilter<EdgeAccessor>,
                        kExpandUniquenessFilter);

#undef POST_VISIT_COST_FIRST

  /**
   * Adds the cost of Unwind and then scales the cardinality by the number of
   * values the Unwind is expected to yield.
   */
  bool PostVisit(Unwind &unwind) override {
    // Unwind cost depends more on the number of lists that get unwound
    // much less on the number of outputs
    // for that reason first increment cost, then modify cardinality
    IncrementCost(CostParam::kUnwind);
    // try to determine how many values will be yielded by Unwind
    // if the Unwind expression is a list literal, we can deduce cardinality
    // exactly, otherwise we approximate
    double unwind_value = MiscParam::kUnwindNoLiteral;
    if (auto *literal =
            dynamic_cast<query::ListLiteral *>(unwind.input_expression_)) {
      unwind_value = static_cast<double>(literal->elements_.size());
    }
    cardinality_ *= unwind_value;
    return true;
  }

  // The following operators influence neither the cost nor the cardinality.
  bool Visit(Once &) override { return true; }
  bool Visit(CreateIndex &) override { return true; }
  bool Visit(AuthHandler &) override { return true; }
  bool Visit(CreateStream &) override { return true; }
  bool Visit(DropStream &) override { return true; }
  bool Visit(ShowStreams &) override { return true; }
  bool Visit(StartStopStream &) override { return true; }
  bool Visit(StartStopAllStreams &) override { return true; }
  bool Visit(TestStream &) override { return true; }

  // TODO: Cost estimate PullRemote and ProduceRemote?

  /** Returns the cost accumulated by the operators visited so far. */
  double cost() const { return cost_; }

  /** Returns the cardinality estimated from the operators visited so far. */
  double cardinality() const { return cardinality_; }

 private:
  // cost estimation that gets accumulated as the visitor
  // tours the logical plan
  double cost_{0};

  // cardinality estimation (how many times an operator gets executed)
  // cardinality is a double to make it easier to work with
  double cardinality_{1};

  // accessor used for cardinality estimates in ScanAll and ScanAllByLabel
  const TDbAccessor &db_accessor_;

  // query parameters used for resolving ParameterLookup expressions
  const Parameters &parameters_;

  // Adds the given cost weight, scaled by the current cardinality estimate, to
  // the accumulated cost.
  void IncrementCost(double param) { cost_ += param * cardinality_; }

  // converts an optional ScanAll range bound into a property value
  // if the bound is present and is a constant expression convertible to
  // a property value. otherwise returns nullopt
  std::experimental::optional<utils::Bound<PropertyValue>> BoundToPropertyValue(
      const std::experimental::optional<ScanAllByLabelPropertyRange::Bound>
          &bound) const {
    if (bound) {
      auto property_value = ConstPropertyValue(bound->value());
      if (property_value) {
        return utils::Bound<PropertyValue>(*property_value, bound->type());
      }
    }
    return std::experimental::nullopt;
  }

  // If the expression is a constant property value, it is returned. Otherwise,
  // return nullopt. An expression is treated as constant if it is a primitive
  // literal or a parameter lookup (resolved through the stored parameters).
  std::experimental::optional<PropertyValue> ConstPropertyValue(
      const Expression *expression) const {
    if (auto *literal = dynamic_cast<const PrimitiveLiteral *>(expression)) {
      return literal->value_;
    } else if (auto *param_lookup =
                   dynamic_cast<const ParameterLookup *>(expression)) {
      return parameters_.AtTokenPosition(param_lookup->token_position_);
    }
    return std::experimental::nullopt;
  }
};
/** Returns the estimated cost of the given plan. */
template <class TDbAccessor>
double EstimatePlanCost(const TDbAccessor &db, const Parameters &parameters,
LogicalOperator &plan) {
CostEstimator<TDbAccessor> estimator(db, parameters);
plan.Accept(estimator);
return estimator.cost();
}
} // namespace query::plan