From e461a0834001d38021065a11dd7886e72891766c Mon Sep 17 00:00:00 2001 From: Teon Banek <teon.banek@memgraph.io> Date: Tue, 5 Feb 2019 16:12:02 +0100 Subject: [PATCH] Move creating indexed lookup to a rewrite pass Summary: RuleBasedPlanner now generates only the regular ScanAll operations, and Filter operations are appended as soon as possible. The newly added Rewrite step, takes this operator tree and replaces viable Filter & ScanAll operators with appropriate ScanAllBy<Index> operator. This change ought to simplify the behaviour of DistributedPlanner when that stage is moved before the indexed lookup rewrite. Showing unoptimized plan in interactive planner is also supported. Reviewers: mtomic, llugovic Reviewed By: mtomic Subscribers: pullbot Differential Revision: https://phabricator.memgraph.io/D1839 --- src/CMakeLists.txt | 3 + src/query/distributed_interpreter.cpp | 3 +- src/query/plan/planner.hpp | 4 +- src/query/plan/preprocess.cpp | 17 +- src/query/plan/preprocess.hpp | 27 +- src/query/plan/rewrite/index_lookup.cpp | 43 ++ src/query/plan/rewrite/index_lookup.hpp | 560 +++++++++++++++++++++ src/query/plan/rule_based_planner.cpp | 8 - src/query/plan/rule_based_planner.hpp | 150 +----- tests/manual/distributed_query_planner.cpp | 2 +- tests/manual/interactive_planning.cpp | 55 +- tests/manual/interactive_planning.hpp | 16 +- tests/unit/distributed_query_plan.cpp | 10 +- tests/unit/query_plan.cpp | 3 + 14 files changed, 709 insertions(+), 192 deletions(-) create mode 100644 src/query/plan/rewrite/index_lookup.cpp create mode 100644 src/query/plan/rewrite/index_lookup.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9c129bc8b..c35ef2da4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -38,6 +38,7 @@ set(mg_single_node_sources query/plan/preprocess.cpp query/plan/pretty_print.cpp query/plan/profile.cpp + query/plan/rewrite/index_lookup.cpp query/plan/rule_based_planner.cpp query/plan/variable_start_planner.cpp query/repl.cpp @@ -144,6 +145,7 @@ set(mg_distributed_sources query/plan/preprocess.cpp query/plan/pretty_print.cpp query/plan/profile.cpp + query/plan/rewrite/index_lookup.cpp query/plan/rule_based_planner.cpp query/plan/variable_start_planner.cpp query/repl.cpp @@ -274,6 +276,7 @@ set(mg_single_node_ha_sources query/plan/preprocess.cpp query/plan/pretty_print.cpp query/plan/profile.cpp + query/plan/rewrite/index_lookup.cpp query/plan/rule_based_planner.cpp query/plan/variable_start_planner.cpp query/repl.cpp diff --git a/src/query/distributed_interpreter.cpp b/src/query/distributed_interpreter.cpp index 77d4d1f2d..2608882a6 100644 --- a/src/query/distributed_interpreter.cpp +++ b/src/query/distributed_interpreter.cpp @@ -73,7 +73,8 @@ class DistributedPostProcessor final { template <class TPlanningContext> plan::DistributedPlan Rewrite(std::unique_ptr<plan::LogicalOperator> plan, TPlanningContext *context) { - original_plan_ = std::move(plan); + plan::PostProcessor post_processor(parameters_); + original_plan_ = post_processor.Rewrite(std::move(plan), context); const auto &property_names = context->ast_storage->properties_; std::vector<storage::Property> properties_by_ix; properties_by_ix.reserve(property_names.size()); diff --git a/src/query/plan/planner.hpp b/src/query/plan/planner.hpp index 1d85299be..8d934cb02 100644 --- a/src/query/plan/planner.hpp +++ b/src/query/plan/planner.hpp @@ -10,6 +10,7 @@ #include "query/plan/operator.hpp" #include "query/plan/preprocess.hpp" #include "query/plan/pretty_print.hpp" +#include "query/plan/rewrite/index_lookup.hpp" #include "query/plan/rule_based_planner.hpp" #include "query/plan/variable_start_planner.hpp" #include "query/plan/vertex_count_cache.hpp" @@ -33,7 +34,8 @@ class PostProcessor final { template <class TPlanningContext> std::unique_ptr<LogicalOperator> Rewrite( std::unique_ptr<LogicalOperator> plan, TPlanningContext *context) { - return plan; + return RewriteWithIndexLookup(std::move(plan), *context->symbol_table, + context->db); } template <class TVertexCounts> diff --git a/src/query/plan/preprocess.cpp b/src/query/plan/preprocess.cpp index d1cd0da99..e661a5925 100644 --- a/src/query/plan/preprocess.cpp +++ b/src/query/plan/preprocess.cpp @@ -188,7 +188,8 @@ void Filters::EraseFilter(const FilterInfo &filter) { all_filters_.end()); } -void Filters::EraseLabelFilter(const Symbol &symbol, LabelIx label) { +void Filters::EraseLabelFilter(const Symbol &symbol, LabelIx label, + std::vector<Expression *> *removed_filters) { for (auto filter_it = all_filters_.begin(); filter_it != all_filters_.end();) { if (filter_it->type != FilterInfo::Type::Label) { @@ -210,6 +211,9 @@ void Filters::EraseLabelFilter(const Symbol &symbol, LabelIx label) { << "Didn't expect duplicated labels"; if (filter_it->labels.empty()) { // If there are no labels to filter, then erase the whole FilterInfo. + if (removed_filters) { + removed_filters->push_back(filter_it->expression); + } filter_it = all_filters_.erase(filter_it); } else { ++filter_it; @@ -315,8 +319,15 @@ void Filters::CollectPatternFilters(Pattern &pattern, SymbolTable &symbol_table, // information for potential property and label indexing. void Filters::CollectWhereFilter(Where &where, const SymbolTable &symbol_table) { - auto where_filters = SplitExpressionOnAnd(where.expression_); - for (const auto &filter : where_filters) { + CollectFilterExpression(where.expression_, symbol_table); +} + +// Adds the expression to `all_filters_` and collects additional +// information for potential property and label indexing. +void Filters::CollectFilterExpression(Expression *expr, + const SymbolTable &symbol_table) { + auto filters = SplitExpressionOnAnd(expr); + for (const auto &filter : filters) { AnalyzeAndStoreFilter(filter, symbol_table); } } diff --git a/src/query/plan/preprocess.hpp b/src/query/plan/preprocess.hpp index b7f9262a1..5b852f77d 100644 --- a/src/query/plan/preprocess.hpp +++ b/src/query/plan/preprocess.hpp @@ -121,7 +121,7 @@ struct FilterInfo { /// /// Info is stored as a list of FilterInfo objects corresponding to all filter /// expressions that should be generated. -class Filters { +class Filters final { public: using iterator = std::vector<FilterInfo>::iterator; using const_iterator = std::vector<FilterInfo>::const_iterator; @@ -147,7 +147,7 @@ class Filters { for (const auto &filter : all_filters_) { if (filter.type == FilterInfo::Type::Label && utils::Contains(filter.used_symbols, symbol)) { - DCHECK(filter.used_symbols.size() == 1U) + CHECK(filter.used_symbols.size() == 1U) << "Expected a single used symbol for label filter"; labels.insert(filter.labels.begin(), filter.labels.end()); } @@ -155,15 +155,18 @@ class Filters { return labels; } - // Remove a filter; may invalidate iterators. - // Removal is done by comparing only the expression, so that multiple - // FilterInfo objects using the same original expression are removed. + /// Remove a filter; may invalidate iterators. + /// Removal is done by comparing only the expression, so that multiple + /// FilterInfo objects using the same original expression are removed. void EraseFilter(const FilterInfo &); - // Remove a label filter for symbol; may invalidate iterators. - void EraseLabelFilter(const Symbol &, LabelIx); + /// Remove a label filter for symbol; may invalidate iterators. + /// If removed_filters is not nullptr, fills the vector with original + /// `Expression *` which are now completely removed. + void EraseLabelFilter(const Symbol &, LabelIx, + std::vector<Expression *> *removed_filters = nullptr); - // Returns a vector of FilterInfo for properties. + /// Returns a vector of FilterInfo for properties. auto PropertyFilters(const Symbol &symbol) const { std::vector<FilterInfo> filters; for (const auto &filter : all_filters_) { @@ -181,6 +184,7 @@ class Filters { /// for found labels, properties and edge types. The generated expressions are /// stored. void CollectPatternFilters(Pattern &, SymbolTable &, AstStorage &); + /// Collects filtering information from a where expression. /// /// Takes the where expression and stores it, then analyzes the expression for @@ -188,6 +192,13 @@ class Filters { /// label filters and property filters, so that indexed scanning can use it. void CollectWhereFilter(Where &, const SymbolTable &); + /// Collects filtering information from an expression. + /// + /// Takes the where expression and stores it, then analyzes the expression for + /// additional information. The additional information is used to populate + /// label filters and property filters, so that indexed scanning can use it. + void CollectFilterExpression(Expression *, const SymbolTable &); + private: void AnalyzeAndStoreFilter(Expression *, const SymbolTable &); diff --git a/src/query/plan/rewrite/index_lookup.cpp b/src/query/plan/rewrite/index_lookup.cpp new file mode 100644 index 000000000..fda80db2a --- /dev/null +++ b/src/query/plan/rewrite/index_lookup.cpp @@ -0,0 +1,43 @@ +#include "query/plan/rewrite/index_lookup.hpp" + +#include "utils/flag_validation.hpp" + +DEFINE_VALIDATED_HIDDEN_int64( + query_vertex_count_to_expand_existing, 10, + "Maximum count of indexed vertices which provoke " + "indexed lookup and then expand to existing, instead of " + "a regular expand. Default is 10, to turn off use -1.", + FLAG_IN_RANGE(-1, std::numeric_limits<std::int64_t>::max())); + +namespace query::plan::impl { + +Expression *RemoveAndExpressions( + Expression *expr, const std::unordered_set<Expression *> &exprs_to_remove) { + auto *and_op = utils::Downcast<AndOperator>(expr); + if (!and_op) return expr; + if (utils::Contains(exprs_to_remove, and_op)) { + return nullptr; + } + if (utils::Contains(exprs_to_remove, and_op->expression1_)) { + and_op->expression1_ = nullptr; + } + if (utils::Contains(exprs_to_remove, and_op->expression2_)) { + and_op->expression2_ = nullptr; + } + and_op->expression1_ = + RemoveAndExpressions(and_op->expression1_, exprs_to_remove); + and_op->expression2_ = + RemoveAndExpressions(and_op->expression2_, exprs_to_remove); + if (!and_op->expression1_ && !and_op->expression2_) { + return nullptr; + } + if (and_op->expression1_ && !and_op->expression2_) { + return and_op->expression1_; + } + if (and_op->expression2_ && !and_op->expression1_) { + return and_op->expression2_; + } + return and_op; +} + +} // namespace query::plan::impl diff --git a/src/query/plan/rewrite/index_lookup.hpp b/src/query/plan/rewrite/index_lookup.hpp new file mode 100644 index 000000000..78b4c96e4 --- /dev/null +++ b/src/query/plan/rewrite/index_lookup.hpp @@ -0,0 +1,560 @@ +/// @file +/// This file provides a plan rewriter which replaces `Filter` and `ScanAll` +/// operations with `ScanAllBy<Index>` if possible. The public entrypoint is +/// `RewriteWithIndexLookup`. + +#pragma once + +#include <algorithm> +#include <experimental/optional> +#include <memory> +#include <unordered_map> +#include <unordered_set> +#include <vector> + +#include <gflags/gflags.h> + +#include "query/plan/operator.hpp" +#include "query/plan/preprocess.hpp" + +DECLARE_int64(query_vertex_count_to_expand_existing); + +namespace query::plan { + +namespace impl { + +// Return the new root expression after removing the given expressions from the +// given expression tree. +Expression *RemoveAndExpressions( + Expression *expr, const std::unordered_set<Expression *> &exprs_to_remove); + +template <class TDbAccessor> +class IndexLookupRewriter final : public HierarchicalLogicalOperatorVisitor { + public: + IndexLookupRewriter(const SymbolTable *symbol_table, TDbAccessor *db) + : symbol_table_(symbol_table), db_(db) {} + + using HierarchicalLogicalOperatorVisitor::PostVisit; + using HierarchicalLogicalOperatorVisitor::PreVisit; + using HierarchicalLogicalOperatorVisitor::Visit; + + bool Visit(Once &) override { return true; } + + bool PreVisit(Filter &op) override { + prev_ops_.push_back(&op); + filters_.CollectFilterExpression(op.expression_, *symbol_table_); + return true; + } + + // Remove no longer needed Filter in PostVisit, this should be the last thing + // Filter::Accept does, so it should be safe to remove the last reference and + // free the memory. + bool PostVisit(Filter &op) override { + prev_ops_.pop_back(); + op.expression_ = + RemoveAndExpressions(op.expression_, filter_exprs_for_removal_); + if (!op.expression_ || + utils::Contains(filter_exprs_for_removal_, op.expression_)) { + SetOnParent(op.input()); + } + return true; + } + + bool PreVisit(ScanAll &op) override { + prev_ops_.push_back(&op); + return true; + } + + // Replace ScanAll with ScanAllBy<Index> in PostVisit, because removal of + // ScanAll may remove the last reference and thus free the memory. PostVisit + // should be the last thing ScanAll::Accept does, so it should be safe. + bool PostVisit(ScanAll &scan) override { + prev_ops_.pop_back(); + auto indexed_scan = GenScanByIndex(scan); + if (indexed_scan) { + SetOnParent(std::move(indexed_scan)); + } + return true; + } + + bool PreVisit(Expand &op) override { + prev_ops_.push_back(&op); + return true; + } + + // See if it might be better to do ScanAllBy<Index> of the destination and + // then do Expand to existing. + bool PostVisit(Expand &expand) override { + prev_ops_.pop_back(); + if (expand.common_.existing_node) { + return true; + } + ScanAll dst_scan(expand.input(), expand.common_.node_symbol, + expand.graph_view_); + auto indexed_scan = + GenScanByIndex(dst_scan, FLAGS_query_vertex_count_to_expand_existing); + if (indexed_scan) { + expand.set_input(std::move(indexed_scan)); + expand.common_.existing_node = true; + } + return true; + } + + // The following operators may only use index lookup in filters inside of + // their own branches. So we handle them all the same. + // * Input operator is visited with the current visitor. + // * Custom operator branches are visited with a new visitor. + + bool PreVisit(Merge &op) override { + prev_ops_.push_back(&op); + op.input()->Accept(*this); + RewriteBranch(&op.merge_match_); + return false; + } + + bool PostVisit(Merge &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(Optional &op) override { + prev_ops_.push_back(&op); + op.input()->Accept(*this); + RewriteBranch(&op.optional_); + return false; + } + + bool PostVisit(Optional &) override { + prev_ops_.pop_back(); + return true; + } + + // Rewriting Cartesian assumes that the input plan will have Filter operations + // as soon as they are possible. Therefore we do not track filters above + // Cartesian because they should be irrelevant. + // + // For example, the following plan is not expected to be an input to + // IndexLookupRewriter. + // + // Filter n.prop = 16 + // | + // Cartesian + // | + // |\ + // | ScanAll (n) + // | + // ScanAll (m) + // + // Instead, the equivalent set of operations should be done this way: + // + // Cartesian + // | + // |\ + // | Filter n.prop = 16 + // | | + // | ScanAll (n) + // | + // ScanAll (m) + bool PreVisit(Cartesian &op) override { + prev_ops_.push_back(&op); + RewriteBranch(&op.left_op_); + RewriteBranch(&op.right_op_); + return false; + } + + bool PostVisit(Cartesian &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(Union &op) override { + prev_ops_.push_back(&op); + RewriteBranch(&op.left_op_); + RewriteBranch(&op.right_op_); + return false; + } + + bool PostVisit(Union &) override { + prev_ops_.pop_back(); + return true; + } + + // The remaining operators should work by just traversing into their input. + + bool PreVisit(CreateNode &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(CreateNode &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(CreateExpand &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(CreateExpand &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(ScanAllByLabel &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(ScanAllByLabel &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(ScanAllByLabelPropertyRange &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(ScanAllByLabelPropertyRange &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(ScanAllByLabelPropertyValue &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(ScanAllByLabelPropertyValue &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(ExpandVariable &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(ExpandVariable &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(ConstructNamedPath &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(ConstructNamedPath &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(Produce &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(Produce &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(Delete &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(Delete &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(SetProperty &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(SetProperty &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(SetProperties &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(SetProperties &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(SetLabels &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(SetLabels &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(RemoveProperty &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(RemoveProperty &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(RemoveLabels &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(RemoveLabels &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(EdgeUniquenessFilter &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(EdgeUniquenessFilter &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(Accumulate &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(Accumulate &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(Aggregate &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(Aggregate &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(Skip &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(Skip &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(Limit &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(Limit &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(OrderBy &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(OrderBy &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(Unwind &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(Unwind &) override { + prev_ops_.pop_back(); + return true; + } + + bool PreVisit(Distinct &op) override { + prev_ops_.push_back(&op); + return true; + } + bool PostVisit(Distinct &) override { + prev_ops_.pop_back(); + return true; + } + + std::shared_ptr<LogicalOperator> new_root_; + + private: + const SymbolTable *symbol_table_; + TDbAccessor *db_; + Filters filters_; + std::unordered_set<Expression *> filter_exprs_for_removal_; + std::vector<LogicalOperator *> prev_ops_; + + struct LabelPropertyIndex { + LabelIx label; + // FilterInfo with PropertyFilter. + FilterInfo filter; + int64_t vertex_count; + }; + + bool DefaultPreVisit() override { + throw utils::NotYetImplemented("optimizing index lookup"); + } + + void SetOnParent(const std::shared_ptr<LogicalOperator> &input) { + CHECK(input); + if (prev_ops_.empty()) { + CHECK(!new_root_); + new_root_ = input; + return; + } + prev_ops_.back()->set_input(input); + } + + void RewriteBranch(std::shared_ptr<LogicalOperator> *branch) { + IndexLookupRewriter<TDbAccessor> rewriter(symbol_table_, db_); + (*branch)->Accept(rewriter); + if (rewriter.new_root_) { + *branch = rewriter.new_root_; + } + } + + storage::Label GetLabel(LabelIx label) { return db_->Label(label.name); } + + storage::Property GetProperty(PropertyIx prop) { + return db_->Property(prop.name); + } + + LabelIx FindBestLabelIndex(const std::unordered_set<LabelIx> &labels) { + CHECK(!labels.empty()) + << "Trying to find the best label without any labels."; + return *std::min_element(labels.begin(), labels.end(), + [this](const auto &label1, const auto &label2) { + return db_->VerticesCount(GetLabel(label1)) < + db_->VerticesCount(GetLabel(label2)); + }); + } + + // Finds the label-property combination which has indexed the lowest amount of + // vertices. If the index cannot be found, nullopt is returned. + std::experimental::optional<LabelPropertyIndex> FindBestLabelPropertyIndex( + const Symbol &symbol, const std::unordered_set<Symbol> &bound_symbols) { + auto are_bound = [&bound_symbols](const auto &used_symbols) { + for (const auto &used_symbol : used_symbols) { + if (!utils::Contains(bound_symbols, used_symbol)) { + return false; + } + } + return true; + }; + std::experimental::optional<LabelPropertyIndex> found; + for (const auto &label : filters_.FilteredLabels(symbol)) { + for (const auto &filter : filters_.PropertyFilters(symbol)) { + const auto &property = filter.property_filter->property_; + if (db_->LabelPropertyIndexExists(GetLabel(label), + GetProperty(property))) { + int64_t vertex_count = + db_->VerticesCount(GetLabel(label), GetProperty(property)); + if (!found || vertex_count < found->vertex_count) { + if (filter.property_filter->is_symbol_in_value_) { + // Skip filter expressions which use the symbol whose property + // we are looking up. We cannot scan by such expressions. For + // example, in `n.a = 2 + n.b` both sides of `=` refer to `n`, + // so we cannot scan `n` by property index. + continue; + } + if (are_bound(filter.used_symbols)) { + // Take the property filter which uses bound symbols. + found = LabelPropertyIndex{label, filter, vertex_count}; + } + } + } + } + } + return found; + } + + // Creates a ScanAll by the best possible index for the `node_symbol`. Best + // index is defined as the index with least number of vertices. If the node + // does not have at least a label, no indexed lookup can be created and + // `nullptr` is returned. The operator is chained after `input`. Optional + // `max_vertex_count` controls, whether no operator should be created if the + // vertex count in the best index exceeds this number. In such a case, + // `nullptr` is returned and `input` is not chained. + std::unique_ptr<ScanAll> GenScanByIndex( + const ScanAll &scan, + const std::experimental::optional<int64_t> &max_vertex_count = + std::experimental::nullopt) { + const auto &input = scan.input(); + const auto &node_symbol = scan.output_symbol_; + const auto &graph_view = scan.graph_view_; + const auto labels = filters_.FilteredLabels(node_symbol); + if (labels.empty()) { + // Without labels, we cannot generate any indexed ScanAll. + return nullptr; + } + // First, try to see if we can use label+property index. If not, use just + // the label index (which ought to exist). + const auto &modified_symbols = scan.ModifiedSymbols(*symbol_table_); + std::unordered_set<Symbol> bound_symbols(modified_symbols.begin(), + modified_symbols.end()); + auto found_index = FindBestLabelPropertyIndex(node_symbol, bound_symbols); + if (found_index && + // Use label+property index if we satisfy max_vertex_count. + (!max_vertex_count || *max_vertex_count >= found_index->vertex_count)) { + // Copy the property filter and then erase it from filters. + const auto prop_filter = *found_index->filter.property_filter; + filter_exprs_for_removal_.insert(found_index->filter.expression); + filters_.EraseFilter(found_index->filter); + std::vector<Expression *> removed_expressions; + filters_.EraseLabelFilter(node_symbol, found_index->label, + &removed_expressions); + filter_exprs_for_removal_.insert(removed_expressions.begin(), + removed_expressions.end()); + if (prop_filter.lower_bound_ || prop_filter.upper_bound_) { + return std::make_unique<ScanAllByLabelPropertyRange>( + input, node_symbol, GetLabel(found_index->label), + GetProperty(prop_filter.property_), prop_filter.property_.name, + prop_filter.lower_bound_, prop_filter.upper_bound_, graph_view); + } else { + CHECK(prop_filter.value_) << "Property filter should either have " + "bounds or a value expression."; + return std::make_unique<ScanAllByLabelPropertyValue>( + input, node_symbol, GetLabel(found_index->label), + GetProperty(prop_filter.property_), prop_filter.property_.name, + prop_filter.value_, graph_view); + } + } + auto label = FindBestLabelIndex(labels); + if (max_vertex_count && + db_->VerticesCount(GetLabel(label)) > *max_vertex_count) { + // Don't create an indexed lookup, since we have more labeled vertices + // than the allowed count. + return nullptr; + } + std::vector<Expression *> removed_expressions; + filters_.EraseLabelFilter(node_symbol, label, &removed_expressions); + filter_exprs_for_removal_.insert(removed_expressions.begin(), + removed_expressions.end()); + return std::make_unique<ScanAllByLabel>(input, node_symbol, GetLabel(label), + graph_view); + } +}; + +} // namespace impl + +template <class TDbAccessor> +std::unique_ptr<LogicalOperator> RewriteWithIndexLookup( + std::unique_ptr<LogicalOperator> root_op, const SymbolTable &symbol_table, + TDbAccessor *db) { + impl::IndexLookupRewriter<TDbAccessor> rewriter(&symbol_table, db); + root_op->Accept(rewriter); + if (rewriter.new_root_) { + // This shouldn't happen in real use case, because IndexLookupRewriter + // removes Filter operations and they cannot be the root op. In case we + // somehow missed this, raise NotYetImplemented instead of CHECK crashing + // the application. + throw utils::NotYetImplemented("optimizing index lookup"); + } + return root_op; +} + +} // namespace query::plan diff --git a/src/query/plan/rule_based_planner.cpp b/src/query/plan/rule_based_planner.cpp index bb29a1d75..447884a67 100644 --- a/src/query/plan/rule_based_planner.cpp +++ b/src/query/plan/rule_based_planner.cpp @@ -8,14 +8,6 @@ #include "utils/algorithm.hpp" #include "utils/exceptions.hpp" -#include "utils/flag_validation.hpp" - -DEFINE_VALIDATED_HIDDEN_int64( - query_vertex_count_to_expand_existing, 10, - "Maximum count of indexed vertices which provoke " - "indexed lookup and then expand to existing, instead of " - "a regular expand. Default is 10, to turn off use -1.", - FLAG_IN_RANGE(-1, std::numeric_limits<std::int64_t>::max())); namespace query::plan { diff --git a/src/query/plan/rule_based_planner.hpp b/src/query/plan/rule_based_planner.hpp index 563b69cfc..01ac2bdc2 100644 --- a/src/query/plan/rule_based_planner.hpp +++ b/src/query/plan/rule_based_planner.hpp @@ -9,8 +9,6 @@ #include "query/plan/operator.hpp" #include "query/plan/preprocess.hpp" -DECLARE_int64(query_vertex_count_to_expand_existing); - namespace query::plan { /// @brief Context which contains variables commonly used during planning. @@ -222,24 +220,16 @@ class RuleBasedPlanner { private: TPlanningContext *context_; - struct LabelPropertyIndex { - LabelIx label; - // FilterInfo with PropertyFilter. - FilterInfo filter; - int64_t vertex_count; - }; - storage::Label GetLabel(LabelIx label) { - return context_->db->Label(context_->ast_storage->labels_[label.ix]); + return context_->db->Label(label.name); } storage::Property GetProperty(PropertyIx prop) { - return context_->db->Property(context_->ast_storage->properties_[prop.ix]); + return context_->db->Property(prop.name); } storage::EdgeType GetEdgeType(EdgeTypeIx edge_type) { - return context_->db->EdgeType( - context_->ast_storage->edge_types_[edge_type.ix]); + return context_->db->EdgeType(edge_type.name); } std::unique_ptr<LogicalOperator> GenCreate( @@ -378,112 +368,6 @@ class RuleBasedPlanner { return nullptr; } - // Finds the label-property combination which has indexed the lowest amount of - // vertices. If the index cannot be found, nullopt is returned. - std::experimental::optional<LabelPropertyIndex> FindBestLabelPropertyIndex( - const Symbol &symbol, const Filters &filters, - const std::unordered_set<Symbol> &bound_symbols) { - auto are_bound = [&bound_symbols](const auto &used_symbols) { - for (const auto &used_symbol : used_symbols) { - if (!utils::Contains(bound_symbols, used_symbol)) { - return false; - } - } - return true; - }; - std::experimental::optional<LabelPropertyIndex> found; - for (const auto &label : filters.FilteredLabels(symbol)) { - for (const auto &filter : filters.PropertyFilters(symbol)) { - const auto &property = filter.property_filter->property_; - if (context_->db->LabelPropertyIndexExists(GetLabel(label), - GetProperty(property))) { - int64_t vertex_count = context_->db->VerticesCount( - GetLabel(label), GetProperty(property)); - if (!found || vertex_count < found->vertex_count) { - if (filter.property_filter->is_symbol_in_value_) { - // Skip filter expressions which use the symbol whose property - // we are looking up. We cannot scan by such expressions. For - // example, in `n.a = 2 + n.b` both sides of `=` refer to `n`, - // so we cannot scan `n` by property index. - continue; - } - if (are_bound(filter.used_symbols)) { - // Take the property filter which uses bound symbols. - found = LabelPropertyIndex{label, filter, vertex_count}; - } - } - } - } - } - return found; - } - - LabelIx FindBestLabelIndex(const std::unordered_set<LabelIx> &labels) { - CHECK(!labels.empty()) - << "Trying to find the best label without any labels."; - return *std::min_element( - labels.begin(), labels.end(), - [this](const auto &label1, const auto &label2) { - return context_->db->VerticesCount(GetLabel(label1)) < - context_->db->VerticesCount(GetLabel(label2)); - }); - } - - // Creates a ScanAll by the best possible index for the `node_symbol`. Best - // index is defined as the index with least number of vertices. If the node - // does not have at least a label, no indexed lookup can be created and - // `nullptr` is returned. The operator is chained after `last_op`. Optional - // `max_vertex_count` controls, whether no operator should be created if the - // vertex count in the best index exceeds this number. In such a case, - // `nullptr` is returned and `last_op` is not chained. - std::unique_ptr<ScanAll> GenScanByIndex( - std::unique_ptr<LogicalOperator> &last_op, const Symbol &node_symbol, - const MatchContext &match_ctx, Filters &filters, - const std::experimental::optional<int64_t> &max_vertex_count = - std::experimental::nullopt) { - const auto labels = filters.FilteredLabels(node_symbol); - if (labels.empty()) { - // Without labels, we cannot generated any indexed ScanAll. - return nullptr; - } - // First, try to see if we can use label+property index. If not, use just - // the label index (which ought to exist). - auto found_index = FindBestLabelPropertyIndex(node_symbol, filters, - match_ctx.bound_symbols); - if (found_index && - // Use label+property index if we satisfy max_vertex_count. - (!max_vertex_count || *max_vertex_count >= found_index->vertex_count)) { - // Copy the property filter and then erase it from filters. - const auto prop_filter = *found_index->filter.property_filter; - filters.EraseFilter(found_index->filter); - filters.EraseLabelFilter(node_symbol, found_index->label); - if (prop_filter.lower_bound_ || prop_filter.upper_bound_) { - return std::make_unique<ScanAllByLabelPropertyRange>( - std::move(last_op), node_symbol, GetLabel(found_index->label), - GetProperty(prop_filter.property_), prop_filter.property_.name, - prop_filter.lower_bound_, prop_filter.upper_bound_, - match_ctx.graph_view); - } else { - DCHECK(prop_filter.value_) << "Property filter should either have " - "bounds or a value expression."; - return std::make_unique<ScanAllByLabelPropertyValue>( - std::move(last_op), node_symbol, GetLabel(found_index->label), - GetProperty(prop_filter.property_), prop_filter.property_.name, - prop_filter.value_, match_ctx.graph_view); - } - } - auto label = FindBestLabelIndex(labels); - if (max_vertex_count && - context_->db->VerticesCount(GetLabel(label)) > *max_vertex_count) { - // Don't create an indexed lookup, since we have more labeled vertices - // than the allowed count. - return nullptr; - } - filters.EraseLabelFilter(node_symbol, label); - return std::make_unique<ScanAllByLabel>( - std::move(last_op), node_symbol, GetLabel(label), match_ctx.graph_view); - } - std::unique_ptr<LogicalOperator> PlanMatching( MatchContext &match_context, std::unique_ptr<LogicalOperator> input_op) { auto &bound_symbols = match_context.bound_symbols; @@ -503,16 +387,8 @@ class RuleBasedPlanner { const auto &node1_symbol = symbol_table.at(*expansion.node1->identifier_); if (bound_symbols.insert(node1_symbol).second) { // We have just bound this symbol, so generate ScanAll which fills it. - if (auto indexed_scan = - GenScanByIndex(last_op, node1_symbol, match_context, filters)) { - // First, try to get an indexed scan. - last_op = std::move(indexed_scan); - } else { - // If indexed scan is not possible, we can only generate ScanAll of - // everything. - last_op = std::make_unique<ScanAll>(std::move(last_op), node1_symbol, - match_context.graph_view); - } + last_op = std::make_unique<ScanAll>(std::move(last_op), node1_symbol, + match_context.graph_view); match_context.new_symbols.emplace_back(node1_symbol); last_op = impl::GenFilters(std::move(last_op), bound_symbols, filters, storage); @@ -602,22 +478,6 @@ class RuleBasedPlanner { expansion.is_flipped, edge->lower_bound_, edge->upper_bound_, existing_node, filter_lambda, weight_lambda, total_weight); } else { - if (!existing_node) { - // Try to get better behaviour by creating an indexed scan and then - // expanding into existing, instead of letting the Expand iterate - // over all the edges. - // Currently, just use the maximum vertex count flag, below which we - // want to replace Expand with index ScanAll + Expand into existing. - // It would be better to somehow test whether the input vertex - // degree is larger than the destination vertex index count. - auto indexed_scan = - GenScanByIndex(last_op, node_symbol, match_context, filters, - FLAGS_query_vertex_count_to_expand_existing); - if (indexed_scan) { - last_op = std::move(indexed_scan); - existing_node = true; - } - } last_op = std::make_unique<Expand>( std::move(last_op), node1_symbol, node_symbol, edge_symbol, expansion.direction, edge_types, existing_node, diff --git a/tests/manual/distributed_query_planner.cpp b/tests/manual/distributed_query_planner.cpp index 189ce7a23..cf0d72ac7 100644 --- a/tests/manual/distributed_query_planner.cpp +++ b/tests/manual/distributed_query_planner.cpp @@ -15,7 +15,7 @@ DEFCOMMAND(ShowDistributed) { std::stringstream ss(args[0]); ss >> plan_ix; if (ss.fail() || !ss.eof() || plan_ix >= plans.size()) return; - const auto &plan = plans[plan_ix].first; + const auto &plan = plans[plan_ix].final_plan; std::atomic<int64_t> plan_id{0}; std::vector<storage::Property> properties_by_ix = query::NamesToProperties(ast_storage.properties_, &dba); diff --git a/tests/manual/interactive_planning.cpp b/tests/manual/interactive_planning.cpp index aae918ac0..f05851ef2 100644 --- a/tests/manual/interactive_planning.cpp +++ b/tests/manual/interactive_planning.cpp @@ -365,10 +365,9 @@ DEFCOMMAND(Top) { if (ss.fail() || !ss.eof()) return; n_plans = std::min(static_cast<int64_t>(plans.size()), n_plans); for (int64_t i = 0; i < n_plans; ++i) { - auto &plan_pair = plans[i]; std::cout << "---- Plan #" << i << " ---- " << std::endl; - std::cout << "cost: " << plan_pair.second << std::endl; - query::plan::PrettyPrint(dba, plan_pair.first.get()); + std::cout << "cost: " << plans[i].cost << std::endl; + query::plan::PrettyPrint(dba, plans[i].final_plan.get()); std::cout << std::endl; } } @@ -378,17 +377,29 @@ DEFCOMMAND(Show) { std::stringstream ss(args[0]); ss >> plan_ix; if (ss.fail() || !ss.eof() || plan_ix >= plans.size()) return; - const auto &plan = plans[plan_ix].first; - auto cost = plans[plan_ix].second; + const auto &plan = plans[plan_ix].final_plan; + auto cost = plans[plan_ix].cost; std::cout << "Plan cost: " << cost << std::endl; query::plan::PrettyPrint(dba, plan.get()); } +DEFCOMMAND(ShowUnoptimized) { + int64_t plan_ix = 0; + std::stringstream ss(args[0]); + ss >> plan_ix; + if (ss.fail() || !ss.eof() || plan_ix >= plans.size()) return; + const auto &plan = plans[plan_ix].unoptimized_plan; + query::plan::PrettyPrint(dba, plan.get()); +} + DEFCOMMAND(Help); std::map<std::string, Command> commands = { {"top", {TopCommand, 1, "Show top N plans"}}, {"show", {ShowCommand, 1, "Show the Nth plan"}}, + {"show-unoptimized", + {ShowUnoptimizedCommand, 1, + "Show the Nth plan in its original, unoptimized form"}}, {"help", {HelpCommand, 0, "Show available commands"}}, }; @@ -407,11 +418,10 @@ DEFCOMMAND(Help) { } } -void ExaminePlans( - database::GraphDbAccessor &dba, const query::SymbolTable &symbol_table, - std::vector<std::pair<std::unique_ptr<query::plan::LogicalOperator>, - double>> &plans, - const query::AstStorage &ast) { +void ExaminePlans(database::GraphDbAccessor &dba, + const query::SymbolTable &symbol_table, + std::vector<InteractivePlan> &plans, + const query::AstStorage &ast) { while (true) { auto line = ReadLine("plan? "); if (!line || *line == "quit") break; @@ -446,31 +456,36 @@ query::Query *MakeAst(const std::string &query, query::AstStorage *storage) { return visitor.query(); } -// Returns a list of pairs (plan, estimated cost), sorted in the ascending -// order by cost. +// Returns a list of InteractivePlan instances, sorted in the ascending order by +// cost. auto MakeLogicalPlans(query::CypherQuery *query, query::AstStorage &ast, query::SymbolTable &symbol_table, InteractiveDbAccessor *dba) { auto query_parts = query::plan::CollectQueryParts(symbol_table, ast, query); - std::vector<std::pair<std::unique_ptr<query::plan::LogicalOperator>, double>> - plans_with_cost; + std::vector<InteractivePlan> interactive_plans; auto ctx = query::plan::MakePlanningContext(&ast, &symbol_table, query, dba); if (query_parts.query_parts.size() <= 0) { std::cerr << "Failed to extract query parts" << std::endl; std::exit(EXIT_FAILURE); } + query::Parameters parameters; + query::plan::PostProcessor post_process(parameters); auto plans = query::plan::MakeLogicalPlanForSingleQuery< query::plan::VariableStartPlanner>( query_parts.query_parts.at(0).single_query_parts, &ctx); - query::Parameters parameters; for (auto plan : plans) { - auto cost = query::plan::EstimatePlanCost(dba, parameters, *plan); - plans_with_cost.emplace_back(std::move(plan), cost); + query::AstStorage ast_copy; + auto unoptimized_plan = plan->Clone(&ast_copy); + auto rewritten_plan = post_process.Rewrite(std::move(plan), &ctx); + double cost = post_process.EstimatePlanCost(rewritten_plan, dba); + interactive_plans.push_back( + InteractivePlan{std::move(unoptimized_plan), std::move(ast_copy), + std::move(rewritten_plan), cost}); } std::stable_sort( - plans_with_cost.begin(), plans_with_cost.end(), - [](const auto &a, const auto &b) { return a.second < b.second; }); - return plans_with_cost; + interactive_plans.begin(), interactive_plans.end(), + [](const auto &a, const auto &b) { return a.cost < b.cost; }); + return interactive_plans; } void RunInteractivePlanning(database::GraphDbAccessor *dba) { diff --git a/tests/manual/interactive_planning.hpp b/tests/manual/interactive_planning.hpp index 05c9e7adc..2577eac4b 100644 --- a/tests/manual/interactive_planning.hpp +++ b/tests/manual/interactive_planning.hpp @@ -13,10 +13,18 @@ namespace database { class GraphDbAccessor; } -// Shorthand for a vector of pairs (logical_plan, cost). -typedef std::vector< - std::pair<std::unique_ptr<query::plan::LogicalOperator>, double>> - PlansWithCost; +struct InteractivePlan { + // Original plan after going only through the RuleBasedPlanner. + std::unique_ptr<query::plan::LogicalOperator> unoptimized_plan; + // Storage for the AST used in unoptimized_plan + query::AstStorage ast_storage; + // Final plan after being rewritten and optimized. + std::unique_ptr<query::plan::LogicalOperator> final_plan; + // Cost of the final plan. + double cost; +}; + +typedef std::vector<InteractivePlan> PlansWithCost; // Encapsulates a consoles command function. struct Command { diff --git a/tests/unit/distributed_query_plan.cpp b/tests/unit/distributed_query_plan.cpp index d7003d32a..86e89e405 100644 --- a/tests/unit/distributed_query_plan.cpp +++ b/tests/unit/distributed_query_plan.cpp @@ -594,8 +594,12 @@ class CapnpPlanner { PlanningContext<TDbAccessor> context) { ::capnp::MallocMessageBuilder message; { + query::Parameters parameters; + PostProcessor post_processor(parameters); auto original_plan = MakeLogicalPlanForSingleQuery<RuleBasedPlanner>( single_query_parts, &context); + original_plan = + post_processor.Rewrite(std::move(original_plan), &context); SavePlan(*original_plan, &message); } { @@ -710,7 +714,11 @@ class Planner { Planner(std::vector<SingleQueryPart> single_query_parts, PlanningContext<TDbAccessor> context) : plan_(MakeLogicalPlanForSingleQuery<RuleBasedPlanner>( - single_query_parts, &context)) {} + single_query_parts, &context)) { + query::Parameters parameters; + PostProcessor post_processor(parameters); + plan_ = post_processor.Rewrite(std::move(plan_), &context); + } auto &plan() { return *plan_; } diff --git a/tests/unit/query_plan.cpp b/tests/unit/query_plan.cpp index c77fe5f2b..e48c895ad 100644 --- a/tests/unit/query_plan.cpp +++ b/tests/unit/query_plan.cpp @@ -44,8 +44,11 @@ class Planner { template <class TDbAccessor> Planner(std::vector<SingleQueryPart> single_query_parts, PlanningContext<TDbAccessor> context) { + query::Parameters parameters; + PostProcessor post_processor(parameters); plan_ = MakeLogicalPlanForSingleQuery<RuleBasedPlanner>(single_query_parts, &context); + plan_ = post_processor.Rewrite(std::move(plan_), &context); } auto &plan() { return *plan_; }