diff --git a/CMakeLists.txt b/CMakeLists.txt
index 78418e9bb..47b9788d2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -362,6 +362,7 @@ set(memgraph_src_files
     ${src_dir}/query/plan/operator.cpp
     ${src_dir}/query/plan/rule_based_planner.cpp
     ${src_dir}/query/plan/variable_start_planner.cpp
+    ${src_dir}/query/plan/cost_estimator.cpp
     ${src_dir}/query/frontend/semantic/symbol_generator.cpp
 )
 # -----------------------------------------------------------------------------
diff --git a/src/query/interpreter.hpp b/src/query/interpreter.hpp
index c9b67a2b2..e87d5770e 100644
--- a/src/query/interpreter.hpp
+++ b/src/query/interpreter.hpp
@@ -9,6 +9,7 @@
 #include "query/frontend/semantic/symbol_generator.hpp"
 #include "query/interpret/frame.hpp"
 #include "query/plan/planner.hpp"
+#include "query/plan/cost_estimator.hpp"
 
 namespace query {
 
@@ -42,6 +43,11 @@ void Interpret(const std::string &query, GraphDbAccessor &db_accessor,
   auto logical_plan = plan::MakeLogicalPlan<plan::RuleBasedPlanner>(
       visitor.storage(), symbol_table, &db_accessor);
 
+  // cost estimation
+  plan::CostEstimator cost_estimator(db_accessor);
+  logical_plan->Accept(cost_estimator);
+  double query_plan_cost_estimation = cost_estimator.cost();
+
   // generate frame based on symbol table max_position
   Frame frame(symbol_table.max_position());
 
@@ -93,6 +99,7 @@ void Interpret(const std::string &query, GraphDbAccessor &db_accessor,
       time_second(antlr_end_time, planning_end_time);
   summary["query_plan_execution_time"] =
       time_second(planning_end_time, execution_end_time);
+  summary["query_cost_estimate"] = query_plan_cost_estimation;
   //
   // TODO set summary['type'] based on transaction metadata
   // the type can't be determined based only on top level LogicalOp
diff --git a/src/query/plan/cost_estimator.cpp b/src/query/plan/cost_estimator.cpp
new file mode 100644
index 000000000..8c73934e6
--- /dev/null
+++ b/src/query/plan/cost_estimator.cpp
@@ -0,0 +1,69 @@
+#include "cost_estimator.hpp"
+
+namespace query::plan {
+
+// Scans first scale the running cardinality by the number of vertices they
+// can produce and only then add their cost, because a scan performs some
+// work for every element that is produced.
+bool CostEstimator::PostVisit(ScanAll &) {
+  cardinality_ *= db_accessor_.vertices_count();
+  IncrementCost(CostParam::kScanAll);
+  return true;
+}
+
+// Same as ScanAll, but only vertices with the operator's label are counted.
+bool CostEstimator::PostVisit(ScanAllByLabel &scan_all_by_label) {
+  cardinality_ *= db_accessor_.vertices_count(scan_all_by_label.label());
+  IncrementCost(CostParam::kScanAllByLabel);
+  return true;
+}
+
+// Expand performs some work for every expansion, so (like the scans) the
+// cardinality is scaled before the cost is added.
+bool CostEstimator::PostVisit(Expand &) {
+  cardinality_ *= CardParam::kExpand;
+  IncrementCost(CostParam::kExpand);
+  return true;
+}
+
+// Defines PostVisit for an op whose cost is incremented first (using the
+// incoming cardinality) and whose cardinality factor is applied afterwards.
+// Invoke without a trailing semicolon: the expansion is a complete function
+// definition, so a semicolon would leave a stray empty declaration behind.
+#define POST_VISIT(LOGICAL_OP, PARAM_NAME)      \
+  bool CostEstimator::PostVisit(LOGICAL_OP &) { \
+    IncrementCost(CostParam::PARAM_NAME);       \
+    cardinality_ *= CardParam::PARAM_NAME;      \
+    return true;                                \
+  }
+
+POST_VISIT(Filter, kFilter)
+POST_VISIT(ExpandUniquenessFilter<VertexAccessor>, kExpandUniquenessFilter)
+POST_VISIT(ExpandUniquenessFilter<EdgeAccessor>, kExpandUniquenessFilter)
+
+#undef POST_VISIT
+
+bool CostEstimator::PostVisit(Unwind &unwind) {
+  // Unwind cost depends more on the number of lists that get unwound,
+  // much less on the number of outputs;
+  // for that reason first increment cost, then modify cardinality
+  IncrementCost(CostParam::kUnwind);
+
+  // try to determine how many values will be yielded by Unwind
+  // if the Unwind expression is a list literal, we can deduce cardinality
+  // exactly, otherwise we approximate
+  // the estimate is kept as a double: cardinality_ and kUnwindNoLiteral are
+  // doubles, and an int would silently truncate a fractional parameter
+  double unwind_value = MiscParam::kUnwindNoLiteral;
+  if (auto *literal =
+          dynamic_cast<query::ListLiteral *>(unwind.input_expression()))
+    unwind_value = static_cast<double>(literal->elements_.size());
+
+  cardinality_ *= unwind_value;
+  return true;
+}
+
+// Once leaves both the cost and the cardinality estimates untouched.
+bool CostEstimator::Visit(Once &) { return true; }
+
+}  // namespace query::plan
diff --git a/src/query/plan/cost_estimator.hpp 
b/src/query/plan/cost_estimator.hpp
new file mode 100644
index 000000000..f969bbe90
--- /dev/null
+++ b/src/query/plan/cost_estimator.hpp
@@ -0,0 +1,102 @@
+#pragma once
+
+#include "logging/loggable.hpp"
+#include "query/plan/operator.hpp"
+#include "query/frontend/ast/ast.hpp"
+#include "query/typed_value.hpp"
+
+namespace query::plan {
+
+/**
+ * @brief: Query plan execution time cost estimator,
+ * for comparing and choosing optimal execution plans.
+ *
+ * In Cypher the write part of the query always executes in
+ * the same cardinality. It is not allowed to execute a write
+ * operation before all the expansions for that query part
+ * (WITH splits a query into parts) have executed.
+ *
+ * Note that expansions and filtering can also happen during
+ * Merge, which is a write operation. We let that get evaluated
+ * like all other cardinality influencing ops. Also, Merge
+ * cardinality modification should be contained (it can never
+ * reduce its input cardinality), but since Merge always happens
+ * after the read part, and can't be reordered, we can ignore
+ * that.
+ *
+ * Limiting and accumulating (Aggregate, OrderBy, Accumulate)
+ * operations are cardinality modifiers that always execute at
+ * the end of the query part. Their cardinality influence is
+ * irrelevant because they generally execute the same for all
+ * plans for a single query part, and query part reordering is
+ * not allowed.
+ *
+ * Usage: construct with a database accessor, pass the estimator to
+ * Accept() of the plan's root operator, then read cost().
+ */
+class CostEstimator : public HierarchicalLogicalOperatorVisitor, Loggable {
+ public:
+  /** Per-operator work factors; each is multiplied by the cardinality
+   * estimate at the moment the operator is costed. */
+  struct CostParam {
+    static constexpr double kScanAll{1.0};
+    static constexpr double kScanAllByLabel{1.1};
+    static constexpr double kExpand{2.0};
+    static constexpr double kFilter{1.5};
+    static constexpr double kExpandUniquenessFilter{1.5};
+    static constexpr double kUnwind{1.3};
+  };
+
+  /** Factors by which an operator scales the cardinality estimate. */
+  struct CardParam {
+    static constexpr double kExpand{3.0};
+    static constexpr double kFilter{0.25};
+    static constexpr double kExpandUniquenessFilter{0.95};
+  };
+
+  /** Parameters that are neither plain cost nor cardinality factors. */
+  struct MiscParam {
+    // cardinality factor assumed for an Unwind whose input expression is
+    // not a list literal
+    static constexpr double kUnwindNoLiteral{10.0};
+  };
+
+  using HierarchicalLogicalOperatorVisitor::PreVisit;
+  using HierarchicalLogicalOperatorVisitor::PostVisit;
+
+  /** @param db_accessor Accessor queried for vertex counts. Only a
+   * reference is stored, so it must outlive the estimator. */
+  explicit CostEstimator(const GraphDbAccessor &db_accessor)
+      : Loggable("QueryCostEstimator"), db_accessor_(db_accessor) {}
+
+  bool PostVisit(ScanAll &) override;
+  bool PostVisit(ScanAllByLabel &scan_all_by_label) override;
+  bool PostVisit(Expand &) override;
+  bool PostVisit(Filter &) override;
+  bool PostVisit(ExpandUniquenessFilter<VertexAccessor> &) override;
+  bool PostVisit(ExpandUniquenessFilter<EdgeAccessor> &) override;
+  bool PostVisit(Unwind &unwind) override;
+  bool Visit(Once &) override;
+
+  /** Cost accumulated over all operators visited so far. */
+  double cost() const { return cost_; }
+  /** Cardinality estimate after the last visited operator. */
+  double cardinality() const { return cardinality_; }
+
+ private:
+  // cost estimation that gets accumulated as the visitor
+  // tours the logical plan
+  double cost_{0};
+
+  // cardinality estimation (how many times an operator gets executed)
+  // cardinality is a double to make it easier to work with
+  double cardinality_{1};
+
+  // accessor used for cardinality estimates in ScanAll and ScanAllByLabel
+  const GraphDbAccessor &db_accessor_;
+
+  // adds the cost of executing an operator once per current cardinality
+  void IncrementCost(double param) { cost_ += param * cardinality_; }
+};
+
+}  // namespace query::plan
diff --git a/src/query/plan/operator.hpp b/src/query/plan/operator.hpp
index 7445bb3ed..feb7b958e 100644
--- a/src/query/plan/operator.hpp
+++ 
b/src/query/plan/operator.hpp @@ -320,6 +320,8 @@ class ScanAllByLabel : public ScanAll { bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override; std::unique_ptr<Cursor> MakeCursor(GraphDbAccessor &db) override; + GraphDbTypes::Label label() const { return label_; } /**< Label stored by this operator; the cost estimator passes it to vertices_count(). */ + private: const GraphDbTypes::Label label_; }; @@ -1221,6 +1223,8 @@ class Unwind : public LogicalOperator { bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override; std::unique_ptr<Cursor> MakeCursor(GraphDbAccessor &db) override; + Expression *input_expression() const { return input_expression_; } /**< Expression being unwound; may be null (the cost estimator unit test constructs Unwind with nullptr). */ + private: const std::shared_ptr<LogicalOperator> input_; Expression *input_expression_; diff --git a/tests/manual/console_test.cpp b/tests/manual/console_test.cpp index 5b957cb6f..6714e9a1d 100644 --- a/tests/manual/console_test.cpp +++ b/tests/manual/console_test.cpp @@ -5,6 +5,9 @@ #include "query/interpreter.hpp" #include "utils/random_graph_generator.hpp" +#include "logging/default.hpp" +#include "logging/streams/stdout.hpp" + void random_generate(Dbms &dbms, uint node_count, int edge_factor = 5) { auto dba = dbms.active(); utils::RandomGraphGenerator generator(*dba); @@ -20,72 +23,23 @@ void random_generate(Dbms &dbms, uint node_count, int edge_factor = 5) { generator.Commit(); } -void fill_db(Dbms &dbms) { - auto dba = dbms.active(); - - // labels - auto company = dba->label("Company"); - auto person = dba->label("Person"); - auto device = dba->label("Device"); - - // props - auto name = dba->property("name"); - auto age = dba->property("age"); - auto type = dba->property("type"); - - // vertices - auto memgraph = dba->insert_vertex(); - memgraph.PropsSet(name, "Memgraph"); - memgraph.add_label(company); - auto teon = dba->insert_vertex(); - teon.PropsSet(name, "Teon"); - teon.PropsSet(age, 26); - teon.add_label(person); - auto mislav = dba->insert_vertex(); - mislav.PropsSet(name, "Mislav"); - mislav.PropsSet(age, 22); - mislav.add_label(person); - auto florijan = 
dba->insert_vertex(); - florijan.PropsSet(name, "Florijan"); - florijan.PropsSet(age, 31); - florijan.add_label(person); - auto xps_15 = dba->insert_vertex(); - xps_15.PropsSet(type, "PC"); - xps_15.PropsSet(name, "Dell XPS 15"); - xps_15.add_label(device); - - // edges - dba->insert_edge(teon, memgraph, dba->edge_type("MEMBER_OF")); - dba->insert_edge(mislav, memgraph, dba->edge_type("MEMBER_OF")); - dba->insert_edge(florijan, memgraph, dba->edge_type("MEMBER_OF")); - - dba->insert_edge(teon, mislav, dba->edge_type("FRIEND_OF")); - dba->insert_edge(mislav, teon, dba->edge_type("FRIEND_OF")); - dba->insert_edge(florijan, mislav, dba->edge_type("FRIEND_OF")); - dba->insert_edge(mislav, florijan, dba->edge_type("FRIEND_OF")); - dba->insert_edge(florijan, teon, dba->edge_type("FRIEND_OF")); - dba->insert_edge(teon, florijan, dba->edge_type("FRIEND_OF")); - - dba->insert_edge(memgraph, xps_15, dba->edge_type("OWNS")); - - dba->insert_edge(teon, xps_15, dba->edge_type("USES")); - dba->insert_edge(mislav, xps_15, dba->edge_type("USES")); - dba->insert_edge(florijan, xps_15, dba->edge_type("USES")); - - dba->commit(); -} - int main(int argc, char *argv[]) { REGISTER_ARGS(argc, argv); // parse the first cmd line argument as the count of nodes to randomly create - uint node_count = 100000; + uint node_count = 0; if (argc > 1) { node_count = (uint) std::stoul(argv[1]); permanent_assert(node_count < 10000000, "More then 10M nodes requested, that's too much"); } + // TODO switch to GFlags, once finally available + if (argc > 2) { + logging::init_sync(); + logging::log->pipe(std::make_unique<Stdout>()); + } + Dbms dbms; std::cout << "Generating graph..." 
<< std::endl;
   // fill_db(dbms);
diff --git a/tests/unit/query_cost_estimator.cpp b/tests/unit/query_cost_estimator.cpp
new file mode 100644
index 000000000..c39de4ebe
--- /dev/null
+++ b/tests/unit/query_cost_estimator.cpp
@@ -0,0 +1,140 @@
+#include <gtest/gtest.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "dbms/dbms.hpp"
+#include "query/frontend/ast/ast.hpp"
+#include "query/frontend/semantic/symbol_table.hpp"
+#include "query/plan/cost_estimator.hpp"
+#include "query/plan/operator.hpp"
+#include "storage/vertex_accessor.hpp"
+
+using namespace query;
+using namespace query::plan;
+
+using CardParam = CostEstimator::CardParam;
+using CostParam = CostEstimator::CostParam;
+using MiscParam = CostEstimator::MiscParam;
+
+/** A fixture for cost estimation. Sets up the database
+ * and accessor, and provides convenience functions for
+ * adding vertices and for creating the logical plan.
+ * Note that the resulting plan is NOT fit for execution,
+ * only for cost estimation testing. */
+class QueryCostEstimator : public ::testing::Test {
+ protected:
+  Dbms dbms;
+  std::unique_ptr<GraphDbAccessor> dba = dbms.active();
+
+  // we incrementally build the logical operator plan
+  // start it off with Once
+  std::shared_ptr<LogicalOperator> last_op_ = std::make_shared<Once>();
+
+  AstTreeStorage storage_;
+  SymbolTable symbol_table_;
+  int symbol_count_ = 0;
+
+  /** Creates a fresh, uniquely named symbol in the symbol table. */
+  Symbol NextSymbol() {
+    return symbol_table_.CreateSymbol(
+        "Symbol" + std::to_string(symbol_count_++), true);
+  }
+
+  /** Adds the given number of vertices to the DB, of which
+   * the first labeled_count are labeled with the given label. */
+  void AddVertices(int vertex_count, GraphDbTypes::Label label,
+                   int labeled_count) {
+    for (int i = 0; i < vertex_count; i++) {
+      auto vertex = dba->insert_vertex();
+      if (i < labeled_count) vertex.add_label(label);
+    }
+
+    // presumably makes the inserts visible to subsequent reads on this
+    // accessor -- TODO confirm against GraphDbAccessor
+    dba->advance_command();
+  }
+
+  /** Runs a fresh CostEstimator over the plan built so far. */
+  double Cost() {
+    CostEstimator cost_estimator(*dba);
+    last_op_->Accept(cost_estimator);
+    return cost_estimator.cost();
+  }
+
+  /** Replaces the plan with a new TLogicalOperator built from args.
+   * Pass last_op_ among args to stack the new op on the current plan. */
+  template <typename TLogicalOperator, typename... TArgs>
+  void MakeOp(TArgs... args) {
+    last_op_ = std::make_shared<TLogicalOperator>(args...);
+  }
+};
+
+// multiply with 1 to avoid linker error (possibly fixed in CLang >= 3.81)
+// COST is parenthesized so that any expression can be passed safely
+#define EXPECT_COST(COST) EXPECT_FLOAT_EQ(Cost(), 1 * (COST))
+
+TEST_F(QueryCostEstimator, Once) { EXPECT_COST(0); }
+
+TEST_F(QueryCostEstimator, ScanAll) {
+  AddVertices(100, dba->label("Label"), 30);
+  MakeOp<ScanAll>(last_op_, NextSymbol());
+  EXPECT_COST(100 * CostParam::kScanAll);
+}
+
+TEST_F(QueryCostEstimator, ScanAllByLabelCardinality) {
+  GraphDbTypes::Label label = dba->label("Label");
+  AddVertices(100, label, 30);
+  MakeOp<ScanAllByLabel>(last_op_, NextSymbol(), label);
+  EXPECT_COST(30 * CostParam::kScanAllByLabel);
+}
+
+TEST_F(QueryCostEstimator, ExpandCardinality) {
+  MakeOp<Expand>(NextSymbol(), NextSymbol(), EdgeAtom::Direction::IN, last_op_,
+                 NextSymbol(), false, false);
+  EXPECT_COST(CardParam::kExpand * CostParam::kExpand);
+}
+
+// helper for testing an operation's cost and cardinality
+// only for operations that first increment cost, then modify cardinality
+// intentionally a macro (instead of function) for better test feedback
+// OP is evaluated twice: the second evaluation stacks the operator on top
+// of the first, so its cost is scaled by the first one's cardinality
+#define TEST_OP(OP, OP_COST_PARAM, OP_CARD_PARAM)                     \
+  do {                                                                \
+    OP;                                                               \
+    EXPECT_COST(OP_COST_PARAM);                                       \
+    OP;                                                               \
+    EXPECT_COST((OP_COST_PARAM) + (OP_CARD_PARAM) * (OP_COST_PARAM)); \
+  } while (0)
+
+TEST_F(QueryCostEstimator, Filter) {
+  TEST_OP(MakeOp<Filter>(last_op_, storage_.Create<PrimitiveLiteral>(true)),
+          CostParam::kFilter, CardParam::kFilter);
+}
+
+TEST_F(QueryCostEstimator, ExpandUniquenessFilter) {
+  TEST_OP(MakeOp<ExpandUniquenessFilter<VertexAccessor>>(last_op_, NextSymbol(),
+                                                         std::vector<Symbol>()),
+          CostParam::kExpandUniquenessFilter,
+          CardParam::kExpandUniquenessFilter);
+}
+
+TEST_F(QueryCostEstimator, UnwindLiteral) {
+  TEST_OP(MakeOp<query::plan::Unwind>(
+              last_op_, storage_.Create<ListLiteral>(
+                            std::vector<Expression *>(7, nullptr)),
+              NextSymbol()),
+          CostParam::kUnwind, 7);
+}
+
+TEST_F(QueryCostEstimator, UnwindNoLiteral) {
+  TEST_OP(MakeOp<query::plan::Unwind>(last_op_, nullptr, NextSymbol()),
+          CostParam::kUnwind, MiscParam::kUnwindNoLiteral);
+}
+
+#undef TEST_OP
+#undef EXPECT_COST
+
+// TODO test cost when ScanAll, Expand, Accumulate, Limit
+// vs cost for SA, Expand, Limit