Query - plan - cost estimator basic. REPL mods

Reviewers: teon.banek, buda, mislav.bradac

Reviewed By: teon.banek, buda

Subscribers: pullbot, teon.banek

Differential Revision: https://phabricator.memgraph.io/D399
This commit is contained in:
florijan 2017-06-01 12:09:18 +02:00
parent 29041eb4d1
commit 509d5db67a
7 changed files with 298 additions and 56 deletions

View File

@ -362,6 +362,7 @@ set(memgraph_src_files
${src_dir}/query/plan/operator.cpp
${src_dir}/query/plan/rule_based_planner.cpp
${src_dir}/query/plan/variable_start_planner.cpp
${src_dir}/query/plan/cost_estimator.cpp
${src_dir}/query/frontend/semantic/symbol_generator.cpp
)
# -----------------------------------------------------------------------------

View File

@ -9,6 +9,7 @@
#include "query/frontend/semantic/symbol_generator.hpp"
#include "query/interpret/frame.hpp"
#include "query/plan/planner.hpp"
#include "query/plan/cost_estimator.hpp"
namespace query {
@ -42,6 +43,11 @@ void Interpret(const std::string &query, GraphDbAccessor &db_accessor,
auto logical_plan = plan::MakeLogicalPlan<plan::RuleBasedPlanner>(
visitor.storage(), symbol_table, &db_accessor);
// cost estimation
plan::CostEstimator cost_estimator(db_accessor);
logical_plan->Accept(cost_estimator);
double query_plan_cost_estimation = cost_estimator.cost();
// generate frame based on symbol table max_position
Frame frame(symbol_table.max_position());
@ -93,6 +99,7 @@ void Interpret(const std::string &query, GraphDbAccessor &db_accessor,
time_second(antlr_end_time, planning_end_time);
summary["query_plan_execution_time"] =
time_second(planning_end_time, execution_end_time);
summary["query_cost_estimate"] = query_plan_cost_estimation;
//
// TODO set summary['type'] based on transaction metadata
// the type can't be determined based only on top level LogicalOp

View File

@ -0,0 +1,62 @@
#include "cost_estimator.hpp"
namespace query::plan {
// ScanAll yields one row per vertex in the database for every input row,
// so the cardinality is scaled by the vertex count before the cost is charged.
bool CostEstimator::PostVisit(ScanAll &) {
  const auto total_vertices = db_accessor_.vertices_count();
  cardinality_ *= total_vertices;

  // the per-element work is paid for every produced element
  IncrementCost(CostParam::kScanAll);
  return true;
}
// ScanAllByLabel yields one row per vertex carrying the scanned label, so the
// cardinality is scaled by that label's vertex count before the cost is
// charged.
bool CostEstimator::PostVisit(ScanAllByLabel &scan_all_by_label) {
  const auto labeled_vertices =
      db_accessor_.vertices_count(scan_all_by_label.label());
  cardinality_ *= labeled_vertices;

  // the per-element work is paid for every produced element
  IncrementCost(CostParam::kScanAllByLabel);
  return true;
}
// Expand first scales the cardinality by the expansion factor and only then
// charges its cost, because the work is done once per expansion.
bool CostEstimator::PostVisit(Expand &) {
  cardinality_ = cardinality_ * CardParam::kExpand;
  IncrementCost(CostParam::kExpand);
  return true;
}
// Generates CostEstimator::PostVisit for operators that share one recipe:
// the cost is charged at the *incoming* cardinality (CostParam::PARAM_NAME),
// and only afterwards is the cardinality scaled by CardParam::PARAM_NAME.
//
// The macro expands to a complete function definition, so its invocations
// must NOT be followed by a semicolon (that would leave an empty declaration
// at namespace scope and trigger pedantic / extra-semi warnings).
#define POST_VISIT(LOGICAL_OP, PARAM_NAME)      \
  bool CostEstimator::PostVisit(LOGICAL_OP &) { \
    IncrementCost(CostParam::PARAM_NAME);       \
    cardinality_ *= CardParam::PARAM_NAME;      \
    return true;                                \
  }

POST_VISIT(Filter, kFilter)
POST_VISIT(ExpandUniquenessFilter<VertexAccessor>, kExpandUniquenessFilter)
POST_VISIT(ExpandUniquenessFilter<EdgeAccessor>, kExpandUniquenessFilter)

#undef POST_VISIT
// Estimates Unwind.
//
// Unwind cost depends mostly on the number of lists that get unwound and much
// less on the number of produced values, so the cost is charged at the
// incoming cardinality and the cardinality is scaled afterwards.
bool CostEstimator::PostVisit(Unwind &unwind) {
  IncrementCost(CostParam::kUnwind);

  // Number of values yielded per unwound list. Kept as a double so that the
  // (double) fallback parameter is never truncated, whatever it is tuned to.
  double unwind_value = MiscParam::kUnwindNoLiteral;

  // If the Unwind expression is a list literal the yielded count is known
  // exactly; otherwise (including a null expression, for which dynamic_cast
  // yields nullptr) the approximation above is used.
  if (auto *literal =
          dynamic_cast<query::ListLiteral *>(unwind.input_expression())) {
    unwind_value = static_cast<double>(literal->elements_.size());
  }

  cardinality_ *= unwind_value;
  return true;
}
// Once contributes nothing to the estimate: cost and cardinality are left
// untouched and the traversal simply continues.
bool CostEstimator::Visit(Once &) {
  return true;
}
} // namespace query::plan

View File

@ -0,0 +1,87 @@
#include "logging/loggable.hpp"
#include "query/plan/operator.hpp"
#include "query/frontend/ast/ast.hpp"
#include "query/typed_value.hpp"
namespace query::plan {
/**
 * @brief Query plan execution time cost estimator,
 * for comparing and choosing optimal execution plans.
 *
 * In Cypher the write part of the query always executes in
 * the same cardinality. It is not allowed to execute a write
 * operation before all the expansions for that query part
 * (WITH splits a query into parts) have executed.
 *
 * Note that expansions and filtering can also happen during
 * Merge, which is a write operation. We let that get evaluated
 * like all other cardinality influencing ops. Also, Merge
 * cardinality modification should be contained (it can never
 * reduce its input cardinality), but since Merge always happens
 * after the read part, and can't be reordered, we can ignore
 * that.
 *
 * Limiting and accumulating (Aggregate, OrderBy, Accumulate)
 * operations are cardinality modifiers that always execute at
 * the end of the query part. Their cardinality influence is
 * irrelevant because they generally execute the same for all
 * plans for a single query part, and query part reordering is
 * not allowed.
 *
 * Usage: construct with a database accessor, pass the estimator to
 * `LogicalOperator::Accept`, then read the result through `cost()`.
 */
class CostEstimator : public HierarchicalLogicalOperatorVisitor, Loggable {
 public:
  /** Cost charged per unit of cardinality for each estimated operator. */
  struct CostParam {
    static constexpr double kScanAll{1.0};
    static constexpr double kScanAllByLabel{1.1};
    static constexpr double kExpand{2.0};
    static constexpr double kFilter{1.5};
    static constexpr double kExpandUniquenessFilter{1.5};
    static constexpr double kUnwind{1.3};
  };

  /** Factor by which an operator multiplies the running cardinality. */
  struct CardParam {
    static constexpr double kExpand{3.0};
    static constexpr double kFilter{0.25};
    static constexpr double kExpandUniquenessFilter{0.95};
  };

  /** Parameters that are neither plain cost nor plain cardinality factors. */
  struct MiscParam {
    // cardinality factor assumed for an Unwind whose input expression
    // is not a list literal
    static constexpr double kUnwindNoLiteral{10.0};
  };

  using HierarchicalLogicalOperatorVisitor::PreVisit;
  using HierarchicalLogicalOperatorVisitor::PostVisit;

  /**
   * @param db_accessor Accessor queried for vertex counts. It is stored by
   *     reference, so it must outlive the estimator.
   *
   * Explicit, so that an accessor is never silently converted
   * into an estimator.
   */
  explicit CostEstimator(const GraphDbAccessor &db_accessor)
      : Loggable("QueryCostEstimator"), db_accessor_(db_accessor) {}

  bool PostVisit(ScanAll &) override;
  bool PostVisit(ScanAllByLabel &scan_all_by_label) override;
  bool PostVisit(Expand &) override;
  bool PostVisit(Filter &) override;
  bool PostVisit(ExpandUniquenessFilter<VertexAccessor> &) override;
  bool PostVisit(ExpandUniquenessFilter<EdgeAccessor> &) override;
  bool PostVisit(Unwind &unwind) override;

  bool Visit(Once &) override;

  /** Cost accumulated over all operators visited so far. */
  auto cost() const { return cost_; }

  /** Cardinality estimate after the operators visited so far. */
  auto cardinality() const { return cardinality_; }

 private:
  // cost estimation that gets accumulated as the visitor
  // tours the logical plan
  double cost_{0};

  // cardinality estimation (how many times an operator gets executed)
  // cardinality is a double to make it easier to work with
  double cardinality_{1};

  // accessor used for cardinality estimates in ScanAll and ScanAllByLabel
  const GraphDbAccessor &db_accessor_;

  // charges `param` once for every unit of the current cardinality
  void IncrementCost(double param) { cost_ += param * cardinality_; }
};
} // namespace query::plan

View File

@ -320,6 +320,8 @@ class ScanAllByLabel : public ScanAll {
bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override;
std::unique_ptr<Cursor> MakeCursor(GraphDbAccessor &db) override;
GraphDbTypes::Label label() const { return label_; }
private:
const GraphDbTypes::Label label_;
};
@ -1221,6 +1223,8 @@ class Unwind : public LogicalOperator {
bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override;
std::unique_ptr<Cursor> MakeCursor(GraphDbAccessor &db) override;
Expression *input_expression() const { return input_expression_; }
private:
const std::shared_ptr<LogicalOperator> input_;
Expression *input_expression_;

View File

@ -5,6 +5,9 @@
#include "query/interpreter.hpp"
#include "utils/random_graph_generator.hpp"
#include "logging/default.hpp"
#include "logging/streams/stdout.hpp"
void random_generate(Dbms &dbms, uint node_count, int edge_factor = 5) {
auto dba = dbms.active();
utils::RandomGraphGenerator generator(*dba);
@ -20,72 +23,23 @@ void random_generate(Dbms &dbms, uint node_count, int edge_factor = 5) {
generator.Commit();
}
/** Fills the active database with a small hand-written sample graph
 * (one company, three persons, one device and the edges between them)
 * and commits the transaction. */
void fill_db(Dbms &dbms) {
  auto dba = dbms.active();

  // labels
  auto company_label = dba->label("Company");
  auto person_label = dba->label("Person");
  auto device_label = dba->label("Device");

  // properties
  auto name_prop = dba->property("name");
  auto age_prop = dba->property("age");
  auto type_prop = dba->property("type");

  // vertices
  auto company_vertex = dba->insert_vertex();
  company_vertex.PropsSet(name_prop, "Memgraph");
  company_vertex.add_label(company_label);

  auto teon_vertex = dba->insert_vertex();
  teon_vertex.PropsSet(name_prop, "Teon");
  teon_vertex.PropsSet(age_prop, 26);
  teon_vertex.add_label(person_label);

  auto mislav_vertex = dba->insert_vertex();
  mislav_vertex.PropsSet(name_prop, "Mislav");
  mislav_vertex.PropsSet(age_prop, 22);
  mislav_vertex.add_label(person_label);

  auto florijan_vertex = dba->insert_vertex();
  florijan_vertex.PropsSet(name_prop, "Florijan");
  florijan_vertex.PropsSet(age_prop, 31);
  florijan_vertex.add_label(person_label);

  auto device_vertex = dba->insert_vertex();
  device_vertex.PropsSet(type_prop, "PC");
  device_vertex.PropsSet(name_prop, "Dell XPS 15");
  device_vertex.add_label(device_label);

  // edges; the edge type is looked up on every insertion
  auto connect = [&dba](auto &from, auto &to, const char *edge_type_name) {
    dba->insert_edge(from, to, dba->edge_type(edge_type_name));
  };

  connect(teon_vertex, company_vertex, "MEMBER_OF");
  connect(mislav_vertex, company_vertex, "MEMBER_OF");
  connect(florijan_vertex, company_vertex, "MEMBER_OF");

  connect(teon_vertex, mislav_vertex, "FRIEND_OF");
  connect(mislav_vertex, teon_vertex, "FRIEND_OF");
  connect(florijan_vertex, mislav_vertex, "FRIEND_OF");
  connect(mislav_vertex, florijan_vertex, "FRIEND_OF");
  connect(florijan_vertex, teon_vertex, "FRIEND_OF");
  connect(teon_vertex, florijan_vertex, "FRIEND_OF");

  connect(company_vertex, device_vertex, "OWNS");

  connect(teon_vertex, device_vertex, "USES");
  connect(mislav_vertex, device_vertex, "USES");
  connect(florijan_vertex, device_vertex, "USES");

  dba->commit();
}
int main(int argc, char *argv[]) {
REGISTER_ARGS(argc, argv);
// parse the first cmd line argument as the count of nodes to randomly create
uint node_count = 100000;
uint node_count = 0;
if (argc > 1) {
node_count = (uint) std::stoul(argv[1]);
permanent_assert(node_count < 10000000,
"More then 10M nodes requested, that's too much");
}
// TODO switch to GFlags, once finally available
if (argc > 2) {
logging::init_sync();
logging::log->pipe(std::make_unique<Stdout>());
}
Dbms dbms;
std::cout << "Generating graph..." << std::endl;
// fill_db(dbms);

View File

@ -0,0 +1,127 @@
#include <gtest/gtest.h>
#include <memory>
#include "dbms/dbms.hpp"
#include "query/frontend/ast/ast.hpp"
#include "query/frontend/semantic/symbol_table.hpp"
#include "query/plan/cost_estimator.hpp"
#include "query/plan/operator.hpp"
#include "storage/vertex_accessor.hpp"
using namespace query;
using namespace query::plan;
using CardParam = CostEstimator::CardParam;
using CostParam = CostEstimator::CostParam;
using MiscParam = CostEstimator::MiscParam;
/** A fixture for cost estimation. Sets up the database
* and accessor (adds some vertices). Provides convenience
* functions for creating the logical plan. Note that the
* resulting plan is NOT fit for execution, only for cost
* estimation testing. */
class QueryCostEstimator : public ::testing::Test {
protected:
Dbms dbms;
std::unique_ptr<GraphDbAccessor> dba = dbms.active();
// we incrementally build the logical operator plan
// start it off with Once
std::shared_ptr<LogicalOperator> last_op_ = std::make_shared<Once>();
AstTreeStorage storage_;
SymbolTable symbol_table_;
int symbol_count = 0;
Symbol NextSymbol() {
return symbol_table_.CreateSymbol("Symbol" + std::to_string(symbol_count++),
true);
}
/** Adds the given number of vertices to the DB, which
* the given number is labeled with the given label */
void AddVertices(int vertex_count, GraphDbTypes::Label label,
int labeled_count) {
for (int i = 0; i < vertex_count; i++) {
auto vertex = dba->insert_vertex();
if (i < labeled_count) vertex.add_label(label);
}
dba->advance_command();
}
auto Cost() {
CostEstimator cost_estimator(*dba);
last_op_->Accept(cost_estimator);
return cost_estimator.cost();
}
template <typename TLogicalOperator, typename... TArgs>
void MakeOp(TArgs... args) {
last_op_ = std::make_shared<TLogicalOperator>(args...);
}
};
// Expects that the estimated cost of the fixture's current plan (Cost())
// equals COST, using gtest's float comparison.
// COST is multiplied with 1 to avoid a linker error (possibly fixed in
// Clang >= 3.81).
// NOTE(review): presumably the multiplication turns a static constexpr
// member into a temporary so it is not odr-used by the gtest macro - confirm.
#define EXPECT_COST(COST) EXPECT_FLOAT_EQ(Cost(), 1 * COST)
// The fixture's initial plan is a lone Once, which must cost nothing.
TEST_F(QueryCostEstimator, Once) { EXPECT_COST(0); }
// ScanAll over 100 vertices (30 of them labeled) is charged for all 100.
TEST_F(QueryCostEstimator, ScanAll) {
  const auto label = dba->label("Label");
  AddVertices(100, label, 30);

  const auto scanned_symbol = NextSymbol();
  MakeOp<ScanAll>(last_op_, scanned_symbol);

  EXPECT_COST(100 * CostParam::kScanAll);
}
// ScanAllByLabel is charged only for the 30 vertices that carry the label.
TEST_F(QueryCostEstimator, ScanAllByLabelCardinality) {
  const auto label = dba->label("Label");
  AddVertices(100, label, 30);

  const auto scanned_symbol = NextSymbol();
  MakeOp<ScanAllByLabel>(last_op_, scanned_symbol, label);

  EXPECT_COST(30 * CostParam::kScanAllByLabel);
}
// Expand scales the cardinality before charging its cost, so a single
// Expand on top of Once costs (cardinality factor) * (cost factor).
TEST_F(QueryCostEstimator, ExpandCardinality) {
  const auto first_symbol = NextSymbol();
  const auto second_symbol = NextSymbol();
  const auto third_symbol = NextSymbol();

  MakeOp<Expand>(first_symbol, second_symbol, EdgeAtom::Direction::IN,
                 last_op_, third_symbol, false, false);

  EXPECT_COST(CardParam::kExpand * CostParam::kExpand);
}
// Helper for testing an operation's cost and cardinality.
// Only for operations that first increment cost, then modify cardinality.
//
// OP is evaluated twice, stacking the operator onto the plan each time:
//  - after the first OP the expected cost is OP_COST_PARAM (cardinality 1),
//  - after the second OP the expected cost additionally contains
//    OP_COST_PARAM charged at the cardinality OP_CARD_PARAM left by the first.
//
// Intentionally a macro (instead of a function) for better test feedback.
#define TEST_OP(OP, OP_COST_PARAM, OP_CARD_PARAM) \
OP; \
EXPECT_COST(OP_COST_PARAM); \
OP; \
EXPECT_COST(OP_COST_PARAM + OP_CARD_PARAM * OP_COST_PARAM);
// Stacks two Filter operators (the macro evaluates the MakeOp expression
// twice) and checks the cost after each against the Filter parameters.
TEST_F(QueryCostEstimator, Filter) {
TEST_OP(MakeOp<Filter>(last_op_, storage_.Create<PrimitiveLiteral>(true)),
CostParam::kFilter, CardParam::kFilter);
}
// Stacks two vertex ExpandUniquenessFilter operators (the macro evaluates
// the MakeOp expression twice, so NextSymbol() runs once per operator) and
// checks the cost after each against the ExpandUniquenessFilter parameters.
TEST_F(QueryCostEstimator, ExpandUniquenessFilter) {
TEST_OP(MakeOp<ExpandUniquenessFilter<VertexAccessor>>(last_op_, NextSymbol(),
std::vector<Symbol>()),
CostParam::kExpandUniquenessFilter,
CardParam::kExpandUniquenessFilter);
}
// Unwind over a list literal with 7 elements: the estimator can read the
// element count, so the expected cardinality factor is exactly 7.
// The elements themselves are null pointers; only their count matters here.
TEST_F(QueryCostEstimator, UnwindLiteral) {
TEST_OP(MakeOp<query::plan::Unwind>(
last_op_, storage_.Create<ListLiteral>(
std::vector<Expression *>(7, nullptr)),
NextSymbol()),
CostParam::kUnwind, 7);
}
// Unwind whose input expression is not a list literal (here a null
// expression): the estimator falls back to MiscParam::kUnwindNoLiteral
// as the cardinality factor.
TEST_F(QueryCostEstimator, UnwindNoLiteral) {
TEST_OP(MakeOp<query::plan::Unwind>(last_op_, nullptr, NextSymbol()),
CostParam::kUnwind, MiscParam::kUnwindNoLiteral);
}
#undef TEST_OP
#undef EXPECT_COST
//
// TODO test cost when ScanAll, Expand, Accumulate, Limit
// vs cost for SA, Expand, Limit