CostEstimator - adding new ScanAll and Expand ops

Summary: Now all ScanAll and Expand ops are covered by the cost estimator. For ScanAll with indices, cost estimation is pretty good; for the new Expand ops it is tragically bad (Expand to the power of expansion depth, plus arbitrary filtering). Static cost estimation is wrong wrong wrong. Currently cost estimation of even trivial plans that use indices is wrong because the planner leaves filtering expressions that are implicitly handled by the index in the operator tree, IIRC. Tasking Teon to revise this, even though I'm not sure how bad an influence this has on cost estimation and its use in plan choosing. Reviewers: mislav.bradac, teon.banek, buda Reviewed By: mislav.bradac Subscribers: pullbot Differential Revision: https://phabricator.memgraph.io/D633
2017-08-03 14:18:19 +02:00 · 2017-08-03 14:18:19 +02:00 · bce4407ebe
commit bce4407ebe
parent 6c22caa80e
5 changed files with 242 additions and 48 deletions
--- a/src/query/plan/cost_estimator.cpp
+++ b/src/query/plan/cost_estimator.cpp
@ -1,3 +1,5 @@
+#include <experimental/optional>
+
 #include "cost_estimator.hpp"

 namespace query::plan {
@ -11,31 +13,112 @@ bool CostEstimator::PostVisit(ScanAll &) {

 bool CostEstimator::PostVisit(ScanAllByLabel &scan_all_by_label) {
  cardinality_ *= db_accessor_.vertices_count(scan_all_by_label.label());
-  // ScanAllByLabel performs some work for every element that is produced
+  // ScanAll performs some work for every element that is produced
  IncrementCost(CostParam::kScanAllByLabel);
  return true;
 }

-bool CostEstimator::PostVisit(Expand &) {
-  cardinality_ *= CardParam::kExpand;
-  // Expand performs some work for every expansion
-  IncrementCost(CostParam::kExpand);
+bool CostEstimator::PostVisit(ScanAllByLabelPropertyValue &logical_op) {
+  // The cardinality of this scan depends on the looked-up value expression.
+  // Only a literal that is convertible to a PropertyValue is known at
+  // planning time; for anything else the cardinality has to be estimated.
+  std::experimental::optional<PropertyValue> known_value;
+  auto *literal = dynamic_cast<PrimitiveLiteral *>(logical_op.expression());
+  if (literal && literal->value_.IsPropertyValue()) {
+    known_value = std::experimental::optional<PropertyValue>(literal->value_);
+  }
+
+  double factor = 1.0;
+  if (known_value) {
+    // Known value: use the count for ScanAll(label, property, value).
+    factor = db_accessor_.vertices_count(
+        logical_op.label(), logical_op.property(), known_value.value());
+  } else {
+    // Unknown value: use the count for ScanAll(label, property), scaled by
+    // the generic filtering factor.
+    factor =
+        db_accessor_.vertices_count(logical_op.label(), logical_op.property()) *
+        CardParam::kFilter;
+  }
+
+  cardinality_ *= factor;
+
+  // The scan performs some work for every element that is produced.
+  IncrementCost(CostParam::MakeScanAllByLabelPropertyValue);
  return true;
 }

-// for the given op first increments the cost and then cardinality
-#define POST_VISIT(LOGICAL_OP, PARAM_NAME)      \
-  bool CostEstimator::PostVisit(LOGICAL_OP &) { \
-    IncrementCost(CostParam::PARAM_NAME);       \
-    cardinality_ *= CardParam::PARAM_NAME;      \
-    return true;                                \
+namespace {
+// Converts an optional ScanAll range bound into a bound on a property value.
+// Returns the converted bound only if the given bound is present, its
+// expression is a literal, and the literal's value is convertible to a
+// PropertyValue. In all other cases returns nullopt.
+std::experimental::optional<utils::Bound<PropertyValue>> BoundToPropertyValue(
+    std::experimental::optional<ScanAllByLabelPropertyRange::Bound> bound) {
+  if (!bound) return std::experimental::nullopt;
+  auto *literal = dynamic_cast<PrimitiveLiteral *>(bound->value());
+  // Guard the TypedValue -> PropertyValue conversion the same way the
+  // ScanAllByLabelPropertyValue estimate does.
+  if (!literal || !literal->value_.IsPropertyValue())
+    return std::experimental::nullopt;
+  return std::experimental::make_optional(
+      utils::Bound<PropertyValue>(literal->value_, bound->type()));
+}
+}  // namespace
+
+bool CostEstimator::PostVisit(ScanAllByLabelPropertyRange &logical_op) {
+  // This cardinality estimation depends on the Bound expressions. Only bounds
+  // that are literals can be turned into concrete values at planning time.
+  auto lower = BoundToPropertyValue(logical_op.lower_bound());
+  auto upper = BoundToPropertyValue(logical_op.upper_bound());
+
+  // Kept as a double (like in the ScanAllByLabelPropertyValue estimate):
+  // scaling by the fractional CardParam::kFilter must not be truncated to an
+  // integer, otherwise small index counts collapse to a factor of 0 and wipe
+  // out the whole cardinality estimate.
+  double factor = 1.0;
+  if (upper || lower)
+    // if we have either Bound<PropertyValue>, use the value index
+    factor = db_accessor_.vertices_count(logical_op.label(),
+                                         logical_op.property(), lower, upper);
+  else
+    // no usable bound values, but we still have the label and the property
+    factor =
+        db_accessor_.vertices_count(logical_op.label(), logical_op.property());
+
+  // if we failed to take either bound from the op into account, then apply
+  // the filtering constant to the factor
+  if ((logical_op.upper_bound() && !upper) ||
+      (logical_op.lower_bound() && !lower))
+    factor *= CardParam::kFilter;
+
+  cardinality_ *= factor;
+
+  // ScanAll performs some work for every element that is produced
+  IncrementCost(CostParam::MakeScanAllByLabelPropertyRange);
+  return true;
+}
+
+// Defines PostVisit(NAME &): scales the cardinality first, then increments cost.
+#define POST_VISIT_CARD_FIRST(NAME)       \
+  bool CostEstimator::PostVisit(NAME &) { \
+    cardinality_ *= CardParam::k##NAME;   \
+    IncrementCost(CostParam::k##NAME);    \
+    return true;                          \
  }

-POST_VISIT(Filter, kFilter)
-POST_VISIT(ExpandUniquenessFilter<VertexAccessor>, kExpandUniquenessFilter);
-POST_VISIT(ExpandUniquenessFilter<EdgeAccessor>, kExpandUniquenessFilter);
+POST_VISIT_CARD_FIRST(Expand);
+POST_VISIT_CARD_FIRST(ExpandVariable);
+POST_VISIT_CARD_FIRST(ExpandBreadthFirst);

-#undef POST_VISIT
+#undef POST_VISIT_CARD_FIRST
+
+// Defines PostVisit(LOGICAL_OP &): increments cost first, then scales cardinality.
+#define POST_VISIT_COST_FIRST(LOGICAL_OP, PARAM_NAME) \
+  bool CostEstimator::PostVisit(LOGICAL_OP &) {       \
+    IncrementCost(CostParam::PARAM_NAME);             \
+    cardinality_ *= CardParam::PARAM_NAME;            \
+    return true;                                      \
+  }
+
+POST_VISIT_COST_FIRST(Filter, kFilter)
+POST_VISIT_COST_FIRST(ExpandUniquenessFilter<VertexAccessor>,
+                      kExpandUniquenessFilter);
+POST_VISIT_COST_FIRST(ExpandUniquenessFilter<EdgeAccessor>,
+                      kExpandUniquenessFilter);
+
+#undef POST_VISIT_COST_FIRST

 bool CostEstimator::PostVisit(Unwind &unwind) {
  // Unwind cost depends more on the number of lists that get unwound
--- a/src/query/plan/cost_estimator.hpp
+++ b/src/query/plan/cost_estimator.hpp
@ -5,36 +5,44 @@
 namespace query::plan {

 /**
- * @brief: Query plan execution time cost estimator,
- * for comparing and choosing optimal execution plans.
+ * Query plan execution time cost estimator, for comparing and choosing optimal
+ * execution plans.
 *
- * In Cypher the write part of the query always executes in
- * the same cardinality. It is not allowed to execute a write
- * operation before all the expansion for that query part
- * (WITH splits a query into parts) have executed.
+ * In Cypher the write part of the query always executes in the same
+ * cardinality. It is not allowed to execute a write operation before all the
+ * expansion for that query part (WITH splits a query into parts) have executed.
+ * For that reason cost estimation comes down to cardinality estimation for the
+ * read parts of the query, and their expansion. We want to compare different
+ * plans and try to figure out which has the optimal organization of scans,
+ * expansions and filters.
 *
- * Note that expansions and filtering can also happen during
- * Merge, which is a write operation. We let that get evaluated
- * like all other cardinality influencing ops. Also, Merge
- * cardinality modification should be contained (it can never
- * reduce it's input cardinality), but since Merge always happens
- * after the read part, and can't be reoredered, we can ignore
- * that.
+ * Note that expansions and filtering can also happen during Merge, which is a
+ * write operation. We let that get evaluated like all other cardinality
+ * influencing ops. Also, Merge cardinality modification should be contained (it
+ * can never reduce its input cardinality), but since Merge always happens
+ * after the read part, and can't be reordered, we can ignore that.
 *
- * Limiting and accumulating (Aggregate, OrderBy, Accumulate)
- * operations are cardinality
- * modifiers that always execute at the end of the
- * query part. Their cardinality influence is irrelevant
- * because they generally execute the same for all plans
- * for a single query part, and query part reordering is
- * not allowed.
+ * Limiting and accumulating (Aggregate, OrderBy, Accumulate) operations are
+ * cardinality modifiers that always execute at the end of the query part. Their
+ * cardinality influence is irrelevant because they execute the same
+ * for all plans for a single query part, and query part reordering is not
+ * allowed.
+ *
+ * This kind of cost estimation can only be used for comparing logical plans.
+ * Its aim is to estimate cost(A) to be less than cost(B) in every case where
+ * actual query execution for plan A is less than that of plan B. It can NOT be
+ * used to estimate how MUCH execution between A and B will differ.
 */
 class CostEstimator : public HierarchicalLogicalOperatorVisitor {
 public:
  struct CostParam {
    static constexpr double kScanAll{1.0};
    static constexpr double kScanAllByLabel{1.1};
+    static constexpr double MakeScanAllByLabelPropertyValue{1.1};
+    static constexpr double MakeScanAllByLabelPropertyRange{1.1};
    static constexpr double kExpand{2.0};
+    static constexpr double kExpandVariable{3.0};
+    static constexpr double kExpandBreadthFirst{5.0};
    static constexpr double kFilter{1.5};
    static constexpr double kExpandUniquenessFilter{1.5};
    static constexpr double kUnwind{1.3};
@ -42,6 +50,8 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {

  struct CardParam {
    static constexpr double kExpand{3.0};
+    static constexpr double kExpandVariable{9.0};
+    static constexpr double kExpandBreadthFirst{8.0};
    static constexpr double kFilter{0.25};
    static constexpr double kExpandUniquenessFilter{0.95};
  };
@ -58,7 +68,11 @@ class CostEstimator : public HierarchicalLogicalOperatorVisitor {

  bool PostVisit(ScanAll &) override;
  bool PostVisit(ScanAllByLabel &scan_all_by_label) override;
+  bool PostVisit(ScanAllByLabelPropertyValue &logical_op) override;
+  bool PostVisit(ScanAllByLabelPropertyRange &logical_op) override;
  bool PostVisit(Expand &) override;
+  bool PostVisit(ExpandVariable &) override;
+  bool PostVisit(ExpandBreadthFirst &) override;
  bool PostVisit(Filter &) override;
  bool PostVisit(ExpandUniquenessFilter<VertexAccessor> &) override;
  bool PostVisit(ExpandUniquenessFilter<EdgeAccessor> &) override;
--- a/src/query/typed_value.cpp
+++ b/src/query/typed_value.cpp
@ -265,6 +265,20 @@ bool TypedValue::IsNumeric() const {
  return type() == TypedValue::Type::Int || type() == TypedValue::Type::Double;
 }

+bool TypedValue::IsPropertyValue() const {
+  // Null, Bool, Int, Double, String and List are reported as convertible;
+  // every other type is not.
+  const auto current = type();
+  return current == Type::Null || current == Type::Bool ||
+         current == Type::Int || current == Type::Double ||
+         current == Type::String || current == Type::List;
+}
+
 std::ostream &operator<<(std::ostream &os, const TypedValue::Type type) {
  switch (type) {
    case TypedValue::Type::Null:
--- a/src/query/typed_value.hpp
+++ b/src/query/typed_value.hpp
@ -140,6 +140,10 @@ class TypedValue : public TotalOrdering<TypedValue, TypedValue, TypedValue> {
   * an integer or double */
  bool IsNumeric() const;

+  /** Convenience function for checking if this TypedValue can be converted into
+   * PropertyValue */
+  bool IsPropertyValue() const;
+
  friend std::ostream &operator<<(std::ostream &stream, const TypedValue &prop);

 private:
--- a/tests/unit/query_cost_estimator.cpp
+++ b/tests/unit/query_cost_estimator.cpp
@ -24,6 +24,8 @@ class QueryCostEstimator : public ::testing::Test {
 protected:
  Dbms dbms;
  std::unique_ptr<GraphDbAccessor> dba = dbms.active();
+  GraphDbTypes::Label label = dba->label("label");
+  GraphDbTypes::Property property = dba->property("property");

  // we incrementally build the logical operator plan
  // start it off with Once
@ -33,18 +35,26 @@ class QueryCostEstimator : public ::testing::Test {
  SymbolTable symbol_table_;
  int symbol_count = 0;

+  // Fixture set-up hook: builds the (label, property) index and then replaces
+  // the accessor with a fresh one.
+  void SetUp() override {
+    // create the index in the current db accessor and then swap it to a new one
+    dba->BuildIndex(label, property);
+    auto new_dba = dbms.active();
+    dba.swap(new_dba);
+  }
+
  Symbol NextSymbol() {
    return symbol_table_.CreateSymbol("Symbol" + std::to_string(symbol_count++),
                                      true);
  }

-  /** Adds the given number of vertices to the DB, which
-   * the given number is labeled with the given label */
-  void AddVertices(int vertex_count, GraphDbTypes::Label label,
-                   int labeled_count) {
+  /** Adds the given number of vertices to the DB, of which
+   * the given numbers are labeled and have a property set. */
+  void AddVertices(int vertex_count, int labeled_count,
+                   int property_count = 0) {
    for (int i = 0; i < vertex_count; i++) {
      auto vertex = dba->insert_vertex();
      if (i < labeled_count) vertex.add_label(label);
+      if (i < property_count) vertex.PropsSet(property, i);
    }

    dba->advance_command();
@ -60,6 +70,18 @@ class QueryCostEstimator : public ::testing::Test {
  void MakeOp(TArgs... args) {
    last_op_ = std::make_shared<TLogicalOperator>(args...);
  }
+
+  // Creates a PrimitiveLiteral for the given value through storage_ and
+  // returns it as a generic Expression pointer.
+  template <typename TValue>
+  Expression *Literal(TValue value) {
+    auto *literal = storage_.Create<PrimitiveLiteral>(value);
+    return literal;
+  }
+
+  // Wraps the given integer in a literal expression and returns it as an
+  // engaged optional holding an inclusive bound.
+  auto InclusiveBound(int bound) {
+    return std::experimental::make_optional(
+        utils::MakeBoundInclusive(Literal(bound)));
+  }
+
+  const std::experimental::nullopt_t nullopt = std::experimental::nullopt;
 };

 // multiply with 1 to avoid linker error (possibly fixed in CLang >= 3.81)
@ -68,27 +90,84 @@ class QueryCostEstimator : public ::testing::Test {
 TEST_F(QueryCostEstimator, Once) { EXPECT_COST(0); }

 TEST_F(QueryCostEstimator, ScanAll) {
-  AddVertices(100, dba->label("Label"), 30);
+  AddVertices(100, 30, 20);
  MakeOp<ScanAll>(last_op_, NextSymbol());
  EXPECT_COST(100 * CostParam::kScanAll);
 }

 TEST_F(QueryCostEstimator, ScanAllByLabelCardinality) {
-  GraphDbTypes::Label label = dba->label("Label");
-  AddVertices(100, label, 30);
+  AddVertices(100, 30, 20);
  MakeOp<ScanAllByLabel>(last_op_, NextSymbol(), label);
  EXPECT_COST(30 * CostParam::kScanAllByLabel);
 }

-TEST_F(QueryCostEstimator, ExpandCardinality) {
+TEST_F(QueryCostEstimator, ScanAllByLabelPropertyValueLiteral) {
+  AddVertices(100, 30, 20);
+  MakeOp<ScanAllByLabelPropertyValue>(last_op_, NextSymbol(), label, property,
+                                      Literal(12));
+  EXPECT_COST(1 * CostParam::MakeScanAllByLabelPropertyValue);
+}
+
+TEST_F(QueryCostEstimator, ScanAllByLabelPropertyValueExpr) {
+  AddVertices(100, 30, 20);
+  MakeOp<ScanAllByLabelPropertyValue>(
+      last_op_, NextSymbol(), label, property,
+      // once we implement expression const-folding this test case will fail
+      storage_.Create<UnaryPlusOperator>(Literal(12)));
+  EXPECT_COST(20 * CardParam::kFilter *
+              CostParam::MakeScanAllByLabelPropertyValue);
+}
+
+TEST_F(QueryCostEstimator, ScanAllByLabelPropertyRangeUpper) {
+  AddVertices(100, 30, 20);
+  MakeOp<ScanAllByLabelPropertyRange>(last_op_, NextSymbol(), label, property,
+                                      nullopt, InclusiveBound(12));
+  // cardinality estimation is exact for very small indexes
+  EXPECT_COST(13 * CostParam::MakeScanAllByLabelPropertyRange);
+}
+
+TEST_F(QueryCostEstimator, ScanAllByLabelPropertyRangeLower) {
+  AddVertices(100, 30, 20);
+  MakeOp<ScanAllByLabelPropertyRange>(last_op_, NextSymbol(), label, property,
+                                      InclusiveBound(17), nullopt);
+  // cardinality estimation is exact for very small indexes
+  EXPECT_COST(3 * CostParam::MakeScanAllByLabelPropertyRange);
+}
+
+TEST_F(QueryCostEstimator, ScanAllByLabelPropertyRangeNonLiteral) {
+  AddVertices(100, 30, 20);
+  auto bound = std::experimental::make_optional(
+      utils::MakeBoundInclusive(static_cast<Expression *>(
+          storage_.Create<UnaryPlusOperator>(Literal(12)))));
+  MakeOp<ScanAllByLabelPropertyRange>(last_op_, NextSymbol(), label, property,
+                                      bound, nullopt);
+  EXPECT_COST(20 * CardParam::kFilter *
+              CostParam::MakeScanAllByLabelPropertyRange);
+}
+
+TEST_F(QueryCostEstimator, Expand) {
  MakeOp<Expand>(NextSymbol(), NextSymbol(), EdgeAtom::Direction::IN, last_op_,
                 NextSymbol(), false, false);
  EXPECT_COST(CardParam::kExpand * CostParam::kExpand);
 }

-// helper for testing an operations cost and cardinality
-// only for operations that first increment cost, then modify cardinality
-// intentially a macro (instead of function) for better test feedback
+TEST_F(QueryCostEstimator, ExpandVariable) {
+  MakeOp<ExpandVariable>(NextSymbol(), NextSymbol(), EdgeAtom::Direction::IN,
+                         nullptr, nullptr, last_op_, NextSymbol(), false,
+                         false);
+  EXPECT_COST(CardParam::kExpandVariable * CostParam::kExpandVariable);
+}
+
+TEST_F(QueryCostEstimator, ExpandBreadthFirst) {
+  MakeOp<ExpandBreadthFirst>(
+      NextSymbol(), NextSymbol(), EdgeAtom::Direction::IN, Literal(3),
+      NextSymbol(), NextSymbol(), Literal(true), last_op_, NextSymbol(), false);
+  EXPECT_COST(CardParam::kExpandBreadthFirst * CostParam::kExpandBreadthFirst);
+}
+
+// Helper for testing an operation's cost and cardinality.
+// Only for operations that first increment cost, then modify cardinality.
+// Intentionally a macro (instead of function) for better test feedback.
 #define TEST_OP(OP, OP_COST_PARAM, OP_CARD_PARAM) \
  OP;                                             \
  EXPECT_COST(OP_COST_PARAM);                     \
@ -96,8 +175,8 @@ TEST_F(QueryCostEstimator, ExpandCardinality) {
  EXPECT_COST(OP_COST_PARAM + OP_CARD_PARAM * OP_COST_PARAM);

 TEST_F(QueryCostEstimator, Filter) {
-  TEST_OP(MakeOp<Filter>(last_op_, storage_.Create<PrimitiveLiteral>(true)),
-          CostParam::kFilter, CardParam::kFilter);
+  TEST_OP(MakeOp<Filter>(last_op_, Literal(true)), CostParam::kFilter,
+          CardParam::kFilter);
 }

 TEST_F(QueryCostEstimator, ExpandUniquenessFilter) {