From 9629f10166255a026aa60a4b80efdad9876d9c37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ante=20Pu=C5=A1i=C4=87?= Date: Wed, 20 Mar 2024 10:29:24 +0100 Subject: [PATCH] Text search (#1603, #1739) Add text search: * named property search * all-property search * regex search * aggregation over search results Text search works with: * non-parallel transactions * durability (WAL files and snapshots) * multitenancy --- include/_mgp.hpp | 15 + include/mg_procedure.h | 32 +- include/mgp.hpp | 79 +++- libs/CMakeLists.txt | 26 ++ query_modules/CMakeLists.txt | 18 + query_modules/text_search_module.cpp | 149 ++++++ src/CMakeLists.txt | 2 +- src/dbms/dbms_handler.hpp | 2 +- src/dbms/inmemory/replication_handlers.cpp | 11 + src/flags/experimental.cpp | 11 +- src/flags/experimental.hpp | 3 +- src/flags/run_time_configurable.cpp | 12 +- src/flags/run_time_configurable.hpp | 2 +- src/query/db_accessor.hpp | 24 + src/query/dump.cpp | 34 ++ src/query/dump.hpp | 1 + src/query/exceptions.hpp | 13 + src/query/frontend/ast/ast.cpp | 3 + src/query/frontend/ast/ast.hpp | 31 ++ src/query/frontend/ast/ast_visitor.hpp | 13 +- .../frontend/ast/cypher_main_visitor.cpp | 22 + .../frontend/ast/cypher_main_visitor.hpp | 17 +- .../frontend/opencypher/grammar/Cypher.g4 | 9 + .../opencypher/grammar/CypherLexer.g4 | 1 + .../opencypher/grammar/MemgraphCypher.g4 | 1 + .../frontend/semantic/required_privileges.cpp | 2 + src/query/frontend/semantic/symbol.hpp | 2 +- src/query/interpreter.cpp | 86 +++- src/query/plan/operator.cpp | 25 + src/query/plan/vertex_count_cache.hpp | 2 +- src/query/procedure/mg_procedure_impl.cpp | 161 ++++++- src/query/procedure/mg_procedure_impl.hpp | 9 +- src/storage/v2/CMakeLists.txt | 4 +- src/storage/v2/disk/durable_metadata.cpp | 28 +- src/storage/v2/disk/durable_metadata.hpp | 6 +- src/storage/v2/disk/storage.cpp | 27 +- src/storage/v2/durability/durability.cpp | 37 +- src/storage/v2/durability/durability.hpp | 5 +- src/storage/v2/durability/marker.hpp | 4 + 
src/storage/v2/durability/metadata.hpp | 1 + src/storage/v2/durability/serialization.cpp | 4 + src/storage/v2/durability/snapshot.cpp | 30 ++ .../durability/storage_global_operation.hpp | 2 + src/storage/v2/durability/wal.cpp | 65 ++- src/storage/v2/durability/wal.hpp | 17 +- src/storage/v2/indices/indices.cpp | 1 + src/storage/v2/indices/indices.hpp | 6 +- src/storage/v2/indices/text_index.cpp | 430 ++++++++++++++++++ src/storage/v2/indices/text_index.hpp | 105 +++++ src/storage/v2/inmemory/storage.cpp | 41 +- src/storage/v2/inmemory/storage.hpp | 8 +- src/storage/v2/metadata_delta.hpp | 19 + src/storage/v2/property_store.cpp | 2 +- .../v2/replication/replication_client.cpp | 5 +- src/storage/v2/storage.cpp | 23 + src/storage/v2/storage.hpp | 31 ++ src/utils/event_counter.cpp | 3 +- src/utils/typeinfo.hpp | 1 + tests/e2e/configuration/default_config.py | 2 +- tests/e2e/text_search/CMakeLists.txt | 6 + tests/e2e/text_search/common.py | 87 ++++ tests/e2e/text_search/test_text_search.py | 206 +++++++++ .../text_search/test_text_search_disabled.py | 69 +++ tests/e2e/text_search/workloads.yaml | 33 ++ tests/unit/query_dump.cpp | 21 +- tests/unit/storage_v2_decoder_encoder.cpp | 2 + tests/unit/storage_v2_get_info.cpp | 1 + tests/unit/storage_v2_wal_file.cpp | 10 +- 68 files changed, 2088 insertions(+), 72 deletions(-) create mode 100644 query_modules/text_search_module.cpp create mode 100644 src/storage/v2/indices/text_index.cpp create mode 100644 src/storage/v2/indices/text_index.hpp create mode 100644 tests/e2e/text_search/CMakeLists.txt create mode 100644 tests/e2e/text_search/common.py create mode 100644 tests/e2e/text_search/test_text_search.py create mode 100644 tests/e2e/text_search/test_text_search_disabled.py create mode 100644 tests/e2e/text_search/workloads.yaml diff --git a/include/_mgp.hpp b/include/_mgp.hpp index 8b67bc36a..b1d9e26d5 100644 --- a/include/_mgp.hpp +++ b/include/_mgp.hpp @@ -326,6 +326,21 @@ inline mgp_vertex 
*graph_get_vertex_by_id(mgp_graph *g, mgp_vertex_id id, mgp_me return MgInvoke(mgp_graph_get_vertex_by_id, g, id, memory); } +inline bool graph_has_text_index(mgp_graph *graph, const char *index_name) { + return MgInvoke(mgp_graph_has_text_index, graph, index_name); +} + +inline mgp_map *graph_search_text_index(mgp_graph *graph, const char *index_name, const char *search_query, + text_search_mode search_mode, mgp_memory *memory) { + return MgInvoke(mgp_graph_search_text_index, graph, index_name, search_query, search_mode, memory); +} + +inline mgp_map *graph_aggregate_over_text_index(mgp_graph *graph, const char *index_name, const char *search_query, + const char *aggregation_query, mgp_memory *memory) { + return MgInvoke(mgp_graph_aggregate_over_text_index, graph, index_name, search_query, aggregation_query, + memory); +} + inline mgp_vertices_iterator *graph_iter_vertices(mgp_graph *g, mgp_memory *memory) { return MgInvoke(mgp_graph_iter_vertices, g, memory); } diff --git a/include/mg_procedure.h b/include/mg_procedure.h index 93ef241d8..117dc66ab 100644 --- a/include/mg_procedure.h +++ b/include/mg_procedure.h @@ -1,4 +1,4 @@ -// Copyright 2023 Memgraph Ltd. +// Copyright 2024 Memgraph Ltd. // // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -891,6 +891,36 @@ enum mgp_error mgp_edge_iter_properties(struct mgp_edge *e, struct mgp_memory *m enum mgp_error mgp_graph_get_vertex_by_id(struct mgp_graph *g, struct mgp_vertex_id id, struct mgp_memory *memory, struct mgp_vertex **result); +/// Result is non-zero if the index with the given name exists. +/// The current implementation always returns without errors. +enum mgp_error mgp_graph_has_text_index(struct mgp_graph *graph, const char *index_name, int *result); + +/// Available modes of searching text indices. 
+MGP_ENUM_CLASS text_search_mode{ + SPECIFIED_PROPERTIES, + REGEX, + ALL_PROPERTIES, +}; + +/// Search the named text index for the given query. The result is a map with the "search_results" and "error_msg" keys. +/// The "search_results" key contains the vertices whose text-indexed properties match the given query. +/// In case of a Tantivy error, the "search_results" key is absent, and "error_msg" contains the error message. +/// Return mgp_error::MGP_ERROR_UNABLE_TO_ALLOCATE if there’s an allocation error while constructing the results map. +/// Return mgp_error::MGP_ERROR_KEY_ALREADY_EXISTS if the same key is being created in the results map more than once. +enum mgp_error mgp_graph_search_text_index(struct mgp_graph *graph, const char *index_name, const char *search_query, + enum text_search_mode search_mode, struct mgp_memory *memory, + struct mgp_map **result); + +/// Aggregate over the results of a search over the named text index. The result is a map with the "aggregation_results" +/// and "error_msg" keys. +/// The "aggregation_results" key contains the result of the aggregation over the matching vertices, as a string. +/// In case of a Tantivy error, the "aggregation_results" key is absent, and "error_msg" contains the error message. +/// Return mgp_error::MGP_ERROR_UNABLE_TO_ALLOCATE if there’s an allocation error while constructing the results map. +/// Return mgp_error::MGP_ERROR_KEY_ALREADY_EXISTS if the same key is being created in the results map more than once. +enum mgp_error mgp_graph_aggregate_over_text_index(struct mgp_graph *graph, const char *index_name, + const char *search_query, const char *aggregation_query, + struct mgp_memory *memory, struct mgp_map **result); + /// Creates label index for given label. /// mgp_error::MGP_ERROR_NO_ERROR is always returned. /// if label index already exists, result will be 0, otherwise 1. 
diff --git a/include/mgp.hpp b/include/mgp.hpp index 3f7ed591e..f35231062 100644 --- a/include/mgp.hpp +++ b/include/mgp.hpp @@ -1,4 +1,4 @@ -// Copyright 2023 Memgraph Ltd. +// Copyright 2024 Memgraph Ltd. // // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -32,6 +32,15 @@ namespace mgp { +class TextSearchException : public std::exception { + public: + explicit TextSearchException(std::string message) : message_(std::move(message)) {} + const char *what() const noexcept override { return message_.c_str(); } + + private: + std::string message_; +}; + class IndexException : public std::exception { public: explicit IndexException(std::string message) : message_(std::move(message)) {} @@ -4306,12 +4315,12 @@ inline void AddParamsReturnsToProc(mgp_proc *proc, std::vector ¶m } } // namespace detail -inline bool CreateLabelIndex(mgp_graph *memgaph_graph, const std::string_view label) { - return create_label_index(memgaph_graph, label.data()); +inline bool CreateLabelIndex(mgp_graph *memgraph_graph, const std::string_view label) { + return create_label_index(memgraph_graph, label.data()); } -inline bool DropLabelIndex(mgp_graph *memgaph_graph, const std::string_view label) { - return drop_label_index(memgaph_graph, label.data()); +inline bool DropLabelIndex(mgp_graph *memgraph_graph, const std::string_view label) { + return drop_label_index(memgraph_graph, label.data()); } inline List ListAllLabelIndices(mgp_graph *memgraph_graph) { @@ -4322,14 +4331,14 @@ inline List ListAllLabelIndices(mgp_graph *memgraph_graph) { return List(label_indices); } -inline bool CreateLabelPropertyIndex(mgp_graph *memgaph_graph, const std::string_view label, +inline bool CreateLabelPropertyIndex(mgp_graph *memgraph_graph, const std::string_view label, const std::string_view property) { - return create_label_property_index(memgaph_graph, label.data(), 
property.data()); + return create_label_property_index(memgraph_graph, label.data(), property.data()); } -inline bool DropLabelPropertyIndex(mgp_graph *memgaph_graph, const std::string_view label, +inline bool DropLabelPropertyIndex(mgp_graph *memgraph_graph, const std::string_view label, const std::string_view property) { - return drop_label_property_index(memgaph_graph, label.data(), property.data()); + return drop_label_property_index(memgraph_graph, label.data(), property.data()); } inline List ListAllLabelPropertyIndices(mgp_graph *memgraph_graph) { @@ -4340,6 +4349,58 @@ inline List ListAllLabelPropertyIndices(mgp_graph *memgraph_graph) { return List(label_property_indices); } +namespace { +constexpr std::string_view kErrorMsgKey = "error_msg"; +constexpr std::string_view kSearchResultsKey = "search_results"; +constexpr std::string_view kAggregationResultsKey = "aggregation_results"; +} // namespace + +inline List SearchTextIndex(mgp_graph *memgraph_graph, std::string_view index_name, std::string_view search_query, + text_search_mode search_mode) { + auto results_or_error = Map(mgp::MemHandlerCallback(graph_search_text_index, memgraph_graph, index_name.data(), + search_query.data(), search_mode)); + if (results_or_error.KeyExists(kErrorMsgKey)) { + if (!results_or_error.At(kErrorMsgKey).IsString()) { + throw TextSearchException{"The error message is not a string!"}; + } + throw TextSearchException(results_or_error.At(kErrorMsgKey).ValueString().data()); + } + + if (!results_or_error.KeyExists(kSearchResultsKey)) { + throw TextSearchException{"Incomplete text index search results!"}; + } + + if (!results_or_error.At(kSearchResultsKey).IsList()) { + throw TextSearchException{"Text index search results have wrong type!"}; + } + + return results_or_error.At(kSearchResultsKey).ValueList(); +} + +inline std::string AggregateOverTextIndex(mgp_graph *memgraph_graph, std::string_view index_name, + std::string_view search_query, std::string_view aggregation_query) 
{ + auto results_or_error = + Map(mgp::MemHandlerCallback(graph_aggregate_over_text_index, memgraph_graph, index_name.data(), + search_query.data(), aggregation_query.data())); + + if (results_or_error.KeyExists(kErrorMsgKey)) { + if (!results_or_error.At(kErrorMsgKey).IsString()) { + throw TextSearchException{"The error message is not a string!"}; + } + throw TextSearchException(results_or_error.At(kErrorMsgKey).ValueString().data()); + } + + if (!results_or_error.KeyExists(kAggregationResultsKey)) { + throw TextSearchException{"Incomplete text index aggregation results!"}; + } + + if (!results_or_error.At(kAggregationResultsKey).IsString()) { + throw TextSearchException{"Text index aggregation results have wrong type!"}; + } + + return std::string(results_or_error.At(kAggregationResultsKey).ValueString());  // owning copy: a view would dangle +} + inline bool CreateExistenceConstraint(mgp_graph *memgraph_graph, const std::string_view label, const std::string_view property) { return create_existence_constraint(memgraph_graph, label.data(), property.data()); diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt index 7d568d548..ab6a313f1 100644 --- a/libs/CMakeLists.txt +++ b/libs/CMakeLists.txt @@ -295,6 +295,32 @@ set_path_external_library(jemalloc STATIC import_header_library(rangev3 ${CMAKE_CURRENT_SOURCE_DIR}/rangev3/include) +ExternalProject_Add(mgcxx-proj + PREFIX mgcxx-proj + GIT_REPOSITORY https://github.com/memgraph/mgcxx + GIT_TAG "v0.0.4" + CMAKE_ARGS + "-DCMAKE_INSTALL_PREFIX=" + "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" + "-DENABLE_TESTS=OFF" + INSTALL_DIR "${PROJECT_BINARY_DIR}/mgcxx" +) +ExternalProject_Get_Property(mgcxx-proj install_dir) +set(MGCXX_ROOT ${install_dir}) + +add_library(tantivy_text_search STATIC IMPORTED GLOBAL) +add_dependencies(tantivy_text_search mgcxx-proj) +set_property(TARGET tantivy_text_search PROPERTY IMPORTED_LOCATION ${MGCXX_ROOT}/lib/libtantivy_text_search.a) + +add_library(mgcxx_text_search STATIC IMPORTED GLOBAL) 
+add_dependencies(mgcxx_text_search mgcxx-proj) 
+set_property(TARGET mgcxx_text_search PROPERTY IMPORTED_LOCATION ${MGCXX_ROOT}/lib/libmgcxx_text_search.a) +# We need to create the include directory first in order to be able to add it +# as an include directory. The header files in the include directory will be +# generated later during the build process. +file(MAKE_DIRECTORY ${MGCXX_ROOT}/include) +set_property(TARGET mgcxx_text_search PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${MGCXX_ROOT}/include) + # Setup NuRaft import_external_library(nuraft STATIC ${CMAKE_CURRENT_SOURCE_DIR}/nuraft/lib/libnuraft.a diff --git a/query_modules/CMakeLists.txt b/query_modules/CMakeLists.txt index 41dbb495c..1336f3eb0 100644 --- a/query_modules/CMakeLists.txt +++ b/query_modules/CMakeLists.txt @@ -6,6 +6,8 @@ project(memgraph_query_modules) disallow_in_source_build() +find_package(fmt REQUIRED) + # Everything that is installed here, should be under the "query_modules" component. set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "query_modules") string(TOLOWER ${CMAKE_BUILD_TYPE} lower_build_type) @@ -58,6 +60,22 @@ install(PROGRAMS $ # Also install the source of the example, so user can read it. install(FILES schema.cpp DESTINATION lib/memgraph/query_modules/src) +add_library(text SHARED text_search_module.cpp) +target_include_directories(text PRIVATE ${CMAKE_SOURCE_DIR}/include) +target_compile_options(text PRIVATE -Wall) +target_link_libraries(text PRIVATE -static-libgcc -static-libstdc++ fmt::fmt) +# Strip C++ example in release build. +if (lower_build_type STREQUAL "release") + add_custom_command(TARGET text POST_BUILD + COMMAND strip -s $ + COMMENT "Stripping symbols and sections from the C++ text_search module") +endif() +install(PROGRAMS $ + DESTINATION lib/memgraph/query_modules + RENAME text.so) +# Also install the source of the example, so user can read it. 
+install(FILES text_search_module.cpp DESTINATION lib/memgraph/query_modules/src) + # Install the Python example and modules install(FILES example.py DESTINATION lib/memgraph/query_modules RENAME py_example.py) install(FILES graph_analyzer.py DESTINATION lib/memgraph/query_modules) diff --git a/query_modules/text_search_module.cpp b/query_modules/text_search_module.cpp new file mode 100644 index 000000000..8e4405058 --- /dev/null +++ b/query_modules/text_search_module.cpp @@ -0,0 +1,149 @@ +// Copyright 2024 Memgraph Ltd. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source +// License, and you may not use this file except in compliance with the Business Source License. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. 
+ +#include +#include + +#include + +#include + +namespace TextSearch { +constexpr std::string_view kProcedureSearch = "search"; +constexpr std::string_view kProcedureRegexSearch = "regex_search"; +constexpr std::string_view kProcedureSearchAllProperties = "search_all"; +constexpr std::string_view kProcedureAggregate = "aggregate"; +constexpr std::string_view kParameterIndexName = "index_name"; +constexpr std::string_view kParameterSearchQuery = "search_query"; +constexpr std::string_view kParameterAggregationQuery = "aggregation_query"; +constexpr std::string_view kReturnNode = "node"; +constexpr std::string_view kReturnAggregation = "aggregation"; +const std::string kSearchAllPrefix = "all"; + +void Search(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memory *memory); +void RegexSearch(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memory *memory); +void SearchAllProperties(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memory *memory); +void Aggregate(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memory *memory); +} // namespace TextSearch + +void TextSearch::Search(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memory *memory) { + mgp::MemoryDispatcherGuard guard{memory}; + const auto record_factory = mgp::RecordFactory(result); + auto arguments = mgp::List(args); + + try { + const auto *index_name = arguments[0].ValueString().data(); + const auto *search_query = arguments[1].ValueString().data(); + for (const auto &node : + mgp::SearchTextIndex(memgraph_graph, index_name, search_query, text_search_mode::SPECIFIED_PROPERTIES)) { + auto record = record_factory.NewRecord(); + record.Insert(TextSearch::kReturnNode.data(), node.ValueNode()); + } + } catch (const std::exception &e) { + record_factory.SetErrorMessage(e.what()); + } +} + +void TextSearch::RegexSearch(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memory *memory) { + 
mgp::MemoryDispatcherGuard guard{memory}; + const auto record_factory = mgp::RecordFactory(result); + auto arguments = mgp::List(args); + + try { + const auto *index_name = arguments[0].ValueString().data(); + const auto *search_query = arguments[1].ValueString().data(); + for (const auto &node : mgp::SearchTextIndex(memgraph_graph, index_name, search_query, text_search_mode::REGEX)) { + auto record = record_factory.NewRecord(); + record.Insert(TextSearch::kReturnNode.data(), node.ValueNode()); + } + } catch (const std::exception &e) { + record_factory.SetErrorMessage(e.what()); + } +} + +void TextSearch::SearchAllProperties(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, + mgp_memory *memory) { + mgp::MemoryDispatcherGuard guard{memory}; + const auto record_factory = mgp::RecordFactory(result); + auto arguments = mgp::List(args); + + try { + const auto *index_name = arguments[0].ValueString().data(); + const auto search_query = fmt::format("{}:{}", kSearchAllPrefix, arguments[1].ValueString()); + for (const auto &node : + mgp::SearchTextIndex(memgraph_graph, index_name, search_query, text_search_mode::ALL_PROPERTIES)) { + auto record = record_factory.NewRecord(); + record.Insert(TextSearch::kReturnNode.data(), node.ValueNode()); + } + } catch (const std::exception &e) { + record_factory.SetErrorMessage(e.what()); + } +} + +void TextSearch::Aggregate(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memory *memory) { + mgp::MemoryDispatcherGuard guard{memory}; + const auto record_factory = mgp::RecordFactory(result); + auto arguments = mgp::List(args); + + try { + const auto *index_name = arguments[0].ValueString().data(); + const auto *search_query = arguments[1].ValueString().data(); + const auto *aggregation_query = arguments[2].ValueString().data(); + const auto aggregation_result = + mgp::AggregateOverTextIndex(memgraph_graph, index_name, search_query, aggregation_query); + auto record = record_factory.NewRecord(); + 
record.Insert(TextSearch::kReturnAggregation.data(), aggregation_result.data()); + } catch (const std::exception &e) { + record_factory.SetErrorMessage(e.what()); + } +} + +extern "C" int mgp_init_module(struct mgp_module *module, struct mgp_memory *memory) { + try { + mgp::MemoryDispatcherGuard guard{memory}; + + AddProcedure(TextSearch::Search, TextSearch::kProcedureSearch, mgp::ProcedureType::Read, + { + mgp::Parameter(TextSearch::kParameterIndexName, mgp::Type::String), + mgp::Parameter(TextSearch::kParameterSearchQuery, mgp::Type::String), + }, + {mgp::Return(TextSearch::kReturnNode, mgp::Type::Node)}, module, memory); + + AddProcedure(TextSearch::RegexSearch, TextSearch::kProcedureRegexSearch, mgp::ProcedureType::Read, + { + mgp::Parameter(TextSearch::kParameterIndexName, mgp::Type::String), + mgp::Parameter(TextSearch::kParameterSearchQuery, mgp::Type::String), + }, + {mgp::Return(TextSearch::kReturnNode, mgp::Type::Node)}, module, memory); + + AddProcedure(TextSearch::SearchAllProperties, TextSearch::kProcedureSearchAllProperties, mgp::ProcedureType::Read, + { + mgp::Parameter(TextSearch::kParameterIndexName, mgp::Type::String), + mgp::Parameter(TextSearch::kParameterSearchQuery, mgp::Type::String), + }, + {mgp::Return(TextSearch::kReturnNode, mgp::Type::Node)}, module, memory); + + AddProcedure(TextSearch::Aggregate, TextSearch::kProcedureAggregate, mgp::ProcedureType::Read, + { + mgp::Parameter(TextSearch::kParameterIndexName, mgp::Type::String), + mgp::Parameter(TextSearch::kParameterSearchQuery, mgp::Type::String), + mgp::Parameter(TextSearch::kParameterAggregationQuery, mgp::Type::String), + }, + {mgp::Return(TextSearch::kReturnAggregation, mgp::Type::String)}, module, memory); + } catch (const std::exception &e) { + std::cerr << "Error while initializing query module: " << e.what() << std::endl; + return 1; + } + + return 0; +} + +extern "C" int mgp_shutdown_module() { return 0; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 
4d5d523c6..af88e624a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -45,7 +45,7 @@ set(mg_single_node_v2_sources add_executable(memgraph ${mg_single_node_v2_sources}) target_include_directories(memgraph PUBLIC ${CMAKE_SOURCE_DIR}/include) target_link_libraries(memgraph stdc++fs Threads::Threads - mg-telemetry mg-communication mg-communication-metrics mg-memory mg-utils mg-license mg-settings mg-glue mg-flags mg::system mg::replication_handler) + mg-telemetry mgcxx_text_search tantivy_text_search mg-communication mg-communication-metrics mg-memory mg-utils mg-license mg-settings mg-glue mg-flags mg::system mg::replication_handler) # NOTE: `include/mg_procedure.syms` describes a pattern match for symbols which # should be dynamically exported, so that `dlopen` can correctly link th diff --git a/src/dbms/dbms_handler.hpp b/src/dbms/dbms_handler.hpp index b0bbd5758..482423ebf 100644 --- a/src/dbms/dbms_handler.hpp +++ b/src/dbms/dbms_handler.hpp @@ -311,7 +311,7 @@ class DbmsHandler { stats.triggers += info.triggers; stats.streams += info.streams; ++stats.num_databases; - stats.indices += storage_info.label_indices + storage_info.label_property_indices; + stats.indices += storage_info.label_indices + storage_info.label_property_indices + storage_info.text_indices; stats.constraints += storage_info.existence_constraints + storage_info.unique_constraints; ++stats.storage_modes[(int)storage_info.storage_mode]; ++stats.isolation_levels[(int)storage_info.isolation_level]; diff --git a/src/dbms/inmemory/replication_handlers.cpp b/src/dbms/inmemory/replication_handlers.cpp index 69f04914c..f9ce7a9d8 100644 --- a/src/dbms/inmemory/replication_handlers.cpp +++ b/src/dbms/inmemory/replication_handlers.cpp @@ -615,6 +615,7 @@ uint64_t InMemoryReplicationHandlers::ReadAndApplyDelta(storage::InMemoryStorage auto vertex = transaction->FindVertex(delta.vertex_add_remove_label.gid, View::NEW); if (!vertex) throw utils::BasicException("Invalid transaction! 
Please raise an issue, {}:{}", __FILE__, __LINE__); + // NOTE: Text search doesn’t have replication in scope yet (Phases 1 and 2) auto ret = vertex->AddLabel(transaction->NameToLabel(delta.vertex_add_remove_label.label)); if (ret.HasError() || !ret.GetValue()) throw utils::BasicException("Invalid transaction! Please raise an issue, {}:{}", __FILE__, __LINE__); @@ -627,6 +628,7 @@ uint64_t InMemoryReplicationHandlers::ReadAndApplyDelta(storage::InMemoryStorage auto vertex = transaction->FindVertex(delta.vertex_add_remove_label.gid, View::NEW); if (!vertex) throw utils::BasicException("Invalid transaction! Please raise an issue, {}:{}", __FILE__, __LINE__); + // NOTE: Text search doesn’t have replication in scope yet (Phases 1 and 2) auto ret = vertex->RemoveLabel(transaction->NameToLabel(delta.vertex_add_remove_label.label)); if (ret.HasError() || !ret.GetValue()) throw utils::BasicException("Invalid transaction! Please raise an issue, {}:{}", __FILE__, __LINE__); @@ -640,6 +642,7 @@ uint64_t InMemoryReplicationHandlers::ReadAndApplyDelta(storage::InMemoryStorage auto vertex = transaction->FindVertex(delta.vertex_edge_set_property.gid, View::NEW); if (!vertex) throw utils::BasicException("Invalid transaction! Please raise an issue, {}:{}", __FILE__, __LINE__); + // NOTE: Text search doesn’t have replication in scope yet (Phases 1 and 2) auto ret = vertex->SetProperty(transaction->NameToProperty(delta.vertex_edge_set_property.property), delta.vertex_edge_set_property.value); if (ret.HasError()) @@ -853,6 +856,14 @@ uint64_t InMemoryReplicationHandlers::ReadAndApplyDelta(storage::InMemoryStorage throw utils::BasicException("Invalid transaction! 
Please raise an issue, {}:{}", __FILE__, __LINE__); break; } + case WalDeltaData::Type::TEXT_INDEX_CREATE: { + // NOTE: Text search doesn’t have replication in scope yet (Phases 1 and 2) + break; + } + case WalDeltaData::Type::TEXT_INDEX_DROP: { + // NOTE: Text search doesn’t have replication in scope yet (Phases 1 and 2) + break; + } case WalDeltaData::Type::EXISTENCE_CONSTRAINT_CREATE: { spdlog::trace(" Create existence constraint on :{} ({})", delta.operation_label_property.label, delta.operation_label_property.property); diff --git a/src/flags/experimental.cpp b/src/flags/experimental.cpp index 123903c96..8c29142a1 100644 --- a/src/flags/experimental.cpp +++ b/src/flags/experimental.cpp @@ -18,14 +18,15 @@ // Bolt server flags. // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) -DEFINE_string(experimental_enabled, "", - "Experimental features to be used, comma seperated. Options [system-replication, high-availability]"); - +DEFINE_string( + experimental_enabled, "", + "Experimental features to be used, comma-separated. 
Options [system-replication, text-search, high-availability]"); using namespace std::string_view_literals; namespace memgraph::flags { auto const mapping = std::map{std::pair{"system-replication"sv, Experiments::SYSTEM_REPLICATION}, + std::pair{"text-search"sv, Experiments::TEXT_SEARCH}, std::pair{"high-availability"sv, Experiments::HIGH_AVAILABILITY}}; auto ExperimentsInstance() -> Experiments & { @@ -45,7 +46,7 @@ bool AreExperimentsEnabled(Experiments experiments) { void InitializeExperimental() { namespace rv = ranges::views; - auto const connonicalize_string = [](auto &&rng) { + auto const canonicalize_string = [](auto &&rng) { auto const is_space = [](auto c) { return c == ' '; }; auto const to_lower = [](unsigned char c) { return std::tolower(c); }; @@ -56,7 +57,7 @@ void InitializeExperimental() { auto const mapping_end = mapping.cend(); using underlying_type = std::underlying_type_t; auto to_set = underlying_type{}; - for (auto &&experiment : FLAGS_experimental_enabled | rv::split(',') | rv::transform(connonicalize_string)) { + for (auto &&experiment : FLAGS_experimental_enabled | rv::split(',') | rv::transform(canonicalize_string)) { if (auto it = mapping.find(experiment); it != mapping_end) { to_set |= static_cast(it->second); } diff --git a/src/flags/experimental.hpp b/src/flags/experimental.hpp index 5a19889fe..0b209a4e8 100644 --- a/src/flags/experimental.hpp +++ b/src/flags/experimental.hpp @@ -23,7 +23,8 @@ namespace memgraph::flags { // old experiments can be reused once code cleanup has happened enum class Experiments : uint8_t { SYSTEM_REPLICATION = 1 << 0, - HIGH_AVAILABILITY = 1 << 1, + TEXT_SEARCH = 1 << 1, + HIGH_AVAILABILITY = 1 << 2, }; bool AreExperimentsEnabled(Experiments experiments); diff --git a/src/flags/run_time_configurable.cpp b/src/flags/run_time_configurable.cpp index a42ebd3d0..6c0fc54ac 100644 --- a/src/flags/run_time_configurable.cpp +++ b/src/flags/run_time_configurable.cpp @@ -1,4 +1,4 @@ -// Copyright 2023 Memgraph Ltd. 
+// Copyright 2024 Memgraph Ltd. // // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -73,11 +73,11 @@ constexpr auto kLogToStderrGFlagsKey = "also_log_to_stderr"; constexpr auto kCartesianProductEnabledSettingKey = "cartesian-product-enabled"; constexpr auto kCartesianProductEnabledGFlagsKey = "cartesian-product-enabled"; -// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -std::atomic execution_timeout_sec_; // Local cache-like thing - -// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -std::atomic cartesian_product_enabled_{true}; // Local cache-like thing +// NOLINTBEGIN(cppcoreguidelines-avoid-non-const-global-variables) +// Local cache-like thing +std::atomic execution_timeout_sec_; +std::atomic cartesian_product_enabled_{true}; +// NOLINTEND(cppcoreguidelines-avoid-non-const-global-variables) auto ToLLEnum(std::string_view val) { const auto ll_enum = memgraph::flags::LogLevelToEnum(val); diff --git a/src/flags/run_time_configurable.hpp b/src/flags/run_time_configurable.hpp index 944a0539f..b215d6540 100644 --- a/src/flags/run_time_configurable.hpp +++ b/src/flags/run_time_configurable.hpp @@ -1,4 +1,4 @@ -// Copyright 2023 Memgraph Ltd. +// Copyright 2024 Memgraph Ltd. 
// // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source diff --git a/src/query/db_accessor.hpp b/src/query/db_accessor.hpp index 915ea9936..ee4988e4a 100644 --- a/src/query/db_accessor.hpp +++ b/src/query/db_accessor.hpp @@ -634,6 +634,24 @@ class DbAccessor final { bool EdgeTypeIndexExists(storage::EdgeTypeId edge_type) const { return accessor_->EdgeTypeIndexExists(edge_type); } + bool TextIndexExists(const std::string &index_name) const { return accessor_->TextIndexExists(index_name); } + + void TextIndexAddVertex(const VertexAccessor &vertex) { accessor_->TextIndexAddVertex(vertex.impl_); } + + void TextIndexUpdateVertex(const VertexAccessor &vertex, const std::vector &removed_labels = {}) { + accessor_->TextIndexUpdateVertex(vertex.impl_, removed_labels); + } + + std::vector TextIndexSearch(const std::string &index_name, const std::string &search_query, + text_search_mode search_mode) const { + return accessor_->TextIndexSearch(index_name, search_query, search_mode); + } + + std::string TextIndexAggregate(const std::string &index_name, const std::string &search_query, + const std::string &aggregation_query) const { + return accessor_->TextIndexAggregate(index_name, search_query, aggregation_query); + } + std::optional GetIndexStats(const storage::LabelId &label) const { return accessor_->GetIndexStats(label); } @@ -717,6 +735,12 @@ class DbAccessor final { return accessor_->DropIndex(edge_type); } + void CreateTextIndex(const std::string &index_name, storage::LabelId label) { + accessor_->CreateTextIndex(index_name, label, this); + } + + void DropTextIndex(const std::string &index_name) { accessor_->DropTextIndex(index_name); } + utils::BasicResult CreateExistenceConstraint( storage::LabelId label, storage::PropertyId property) { return accessor_->CreateExistenceConstraint(label, property); diff --git a/src/query/dump.cpp 
b/src/query/dump.cpp index f1dd08c8d..abc147ee8 100644 --- a/src/query/dump.cpp +++ b/src/query/dump.cpp @@ -252,6 +252,10 @@ void DumpLabelPropertyIndex(std::ostream *os, query::DbAccessor *dba, storage::L << ");"; } +void DumpTextIndex(std::ostream *os, query::DbAccessor *dba, const std::string &index_name, storage::LabelId label) { + *os << "CREATE TEXT INDEX " << EscapeName(index_name) << " ON :" << EscapeName(dba->LabelToName(label)) << ";"; +} + void DumpExistenceConstraint(std::ostream *os, query::DbAccessor *dba, storage::LabelId label, storage::PropertyId property) { *os << "CREATE CONSTRAINT ON (u:" << EscapeName(dba->LabelToName(label)) << ") ASSERT EXISTS (u." @@ -286,6 +290,8 @@ PullPlanDump::PullPlanDump(DbAccessor *dba, dbms::DatabaseAccess db_acc) CreateLabelIndicesPullChunk(), // Dump all label property indices CreateLabelPropertyIndicesPullChunk(), + // Dump all text indices + CreateTextIndicesPullChunk(), // Dump all existence constraints CreateExistenceConstraintsPullChunk(), // Dump all unique constraints @@ -412,6 +418,34 @@ PullPlanDump::PullChunk PullPlanDump::CreateLabelPropertyIndicesPullChunk() { }; } +PullPlanDump::PullChunk PullPlanDump::CreateTextIndicesPullChunk() { + // Dump all text indices + return [this, global_index = 0U](AnyStream *stream, std::optional n) mutable -> std::optional { + // Delay the construction of indices vectors + if (!indices_info_) { + indices_info_.emplace(dba_->ListAllIndices()); + } + const auto &text = indices_info_->text_indices; + + size_t local_counter = 0; + while (global_index < text.size() && (!n || local_counter < *n)) { + std::ostringstream os; + const auto &text_index = text[global_index]; + DumpTextIndex(&os, dba_, text_index.first, text_index.second); + stream->Result({TypedValue(os.str())}); + + ++global_index; + ++local_counter; + } + + if (global_index == text.size()) { + return local_counter; + } + + return std::nullopt; + }; +} + PullPlanDump::PullChunk 
PullPlanDump::CreateExistenceConstraintsPullChunk() { return [this, global_index = 0U](AnyStream *stream, std::optional n) mutable -> std::optional { // Delay the construction of constraint vectors diff --git a/src/query/dump.hpp b/src/query/dump.hpp index 05bd42967..0cf4a82a6 100644 --- a/src/query/dump.hpp +++ b/src/query/dump.hpp @@ -55,6 +55,7 @@ struct PullPlanDump { PullChunk CreateLabelIndicesPullChunk(); PullChunk CreateLabelPropertyIndicesPullChunk(); + PullChunk CreateTextIndicesPullChunk(); PullChunk CreateExistenceConstraintsPullChunk(); PullChunk CreateUniqueConstraintsPullChunk(); PullChunk CreateInternalIndexPullChunk(); diff --git a/src/query/exceptions.hpp b/src/query/exceptions.hpp index 147dc8710..a4c25fbae 100644 --- a/src/query/exceptions.hpp +++ b/src/query/exceptions.hpp @@ -433,4 +433,17 @@ class MultiDatabaseQueryInMulticommandTxException : public QueryException { SPECIALIZE_GET_EXCEPTION_NAME(MultiDatabaseQueryInMulticommandTxException) }; +class TextSearchException : public QueryException { + using QueryException::QueryException; + SPECIALIZE_GET_EXCEPTION_NAME(TextSearchException) +}; + +class TextSearchDisabledException : public TextSearchException { + public: + TextSearchDisabledException() + : TextSearchException( + "To use text indices and text search, start Memgraph with the experimental text search feature enabled.") {} + SPECIALIZE_GET_EXCEPTION_NAME(TextSearchDisabledException) +}; + } // namespace memgraph::query diff --git a/src/query/frontend/ast/ast.cpp b/src/query/frontend/ast/ast.cpp index 7da5c09a0..f0d09d453 100644 --- a/src/query/frontend/ast/ast.cpp +++ b/src/query/frontend/ast/ast.cpp @@ -189,6 +189,9 @@ constexpr utils::TypeInfo query::IndexQuery::kType{utils::TypeId::AST_INDEX_QUER constexpr utils::TypeInfo query::EdgeIndexQuery::kType{utils::TypeId::AST_EDGE_INDEX_QUERY, "EdgeIndexQuery", &query::Query::kType}; +constexpr utils::TypeInfo query::TextIndexQuery::kType{utils::TypeId::AST_TEXT_INDEX_QUERY, 
"TextIndexQuery", + &query::Query::kType}; + constexpr utils::TypeInfo query::Create::kType{utils::TypeId::AST_CREATE, "Create", &query::Clause::kType}; constexpr utils::TypeInfo query::CallProcedure::kType{utils::TypeId::AST_CALL_PROCEDURE, "CallProcedure", diff --git a/src/query/frontend/ast/ast.hpp b/src/query/frontend/ast/ast.hpp index 29f7be3cf..e3d7bc0b2 100644 --- a/src/query/frontend/ast/ast.hpp +++ b/src/query/frontend/ast/ast.hpp @@ -2273,6 +2273,37 @@ class EdgeIndexQuery : public memgraph::query::Query { friend class AstStorage; }; +class TextIndexQuery : public memgraph::query::Query { + public: + static const utils::TypeInfo kType; + const utils::TypeInfo &GetTypeInfo() const override { return kType; } + + enum class Action { CREATE, DROP }; + + TextIndexQuery() = default; + + DEFVISITABLE(QueryVisitor); + + memgraph::query::TextIndexQuery::Action action_; + memgraph::query::LabelIx label_; + std::string index_name_; + + TextIndexQuery *Clone(AstStorage *storage) const override { + TextIndexQuery *object = storage->Create(); + object->action_ = action_; + object->label_ = storage->GetLabelIx(label_.name); + object->index_name_ = index_name_; + return object; + } + + protected: + TextIndexQuery(Action action, LabelIx label, std::string index_name) + : action_(action), label_(std::move(label)), index_name_(index_name) {} + + private: + friend class AstStorage; +}; + class Create : public memgraph::query::Clause { public: static const utils::TypeInfo kType; diff --git a/src/query/frontend/ast/ast_visitor.hpp b/src/query/frontend/ast/ast_visitor.hpp index bf11878da..cc6aed138 100644 --- a/src/query/frontend/ast/ast_visitor.hpp +++ b/src/query/frontend/ast/ast_visitor.hpp @@ -83,6 +83,7 @@ class ExplainQuery; class ProfileQuery; class IndexQuery; class EdgeIndexQuery; +class TextIndexQuery; class DatabaseInfoQuery; class SystemInfoQuery; class ConstraintQuery; @@ -144,11 +145,11 @@ class ExpressionVisitor template class QueryVisitor - : public 
utils::Visitor {}; + : public utils::Visitor {}; } // namespace memgraph::query diff --git a/src/query/frontend/ast/cypher_main_visitor.cpp b/src/query/frontend/ast/cypher_main_visitor.cpp index 6da48c97c..35ccb3670 100644 --- a/src/query/frontend/ast/cypher_main_visitor.cpp +++ b/src/query/frontend/ast/cypher_main_visitor.cpp @@ -243,6 +243,13 @@ antlrcpp::Any CypherMainVisitor::visitIndexQuery(MemgraphCypher::IndexQueryConte return index_query; } +antlrcpp::Any CypherMainVisitor::visitTextIndexQuery(MemgraphCypher::TextIndexQueryContext *ctx) { + MG_ASSERT(ctx->children.size() == 1, "TextIndexQuery should have exactly one child!"); + auto *text_index_query = std::any_cast(ctx->children[0]->accept(this)); + query_ = text_index_query; + return text_index_query; +} + antlrcpp::Any CypherMainVisitor::visitCreateIndex(MemgraphCypher::CreateIndexContext *ctx) { auto *index_query = storage_->Create(); index_query->action_ = IndexQuery::Action::CREATE; @@ -286,6 +293,21 @@ antlrcpp::Any CypherMainVisitor::visitDropEdgeIndex(MemgraphCypher::DropEdgeInde return index_query; } +antlrcpp::Any CypherMainVisitor::visitCreateTextIndex(MemgraphCypher::CreateTextIndexContext *ctx) { + auto *index_query = storage_->Create(); + index_query->index_name_ = std::any_cast(ctx->indexName()->accept(this)); + index_query->action_ = TextIndexQuery::Action::CREATE; + index_query->label_ = AddLabel(std::any_cast(ctx->labelName()->accept(this))); + return index_query; +} + +antlrcpp::Any CypherMainVisitor::visitDropTextIndex(MemgraphCypher::DropTextIndexContext *ctx) { + auto *index_query = storage_->Create(); + index_query->index_name_ = std::any_cast(ctx->indexName()->accept(this)); + index_query->action_ = TextIndexQuery::Action::DROP; + return index_query; +} + antlrcpp::Any CypherMainVisitor::visitAuthQuery(MemgraphCypher::AuthQueryContext *ctx) { MG_ASSERT(ctx->children.size() == 1, "AuthQuery should have exactly one child!"); auto *auth_query = 
std::any_cast(ctx->children[0]->accept(this)); diff --git a/src/query/frontend/ast/cypher_main_visitor.hpp b/src/query/frontend/ast/cypher_main_visitor.hpp index 8c65345c8..53738af61 100644 --- a/src/query/frontend/ast/cypher_main_visitor.hpp +++ b/src/query/frontend/ast/cypher_main_visitor.hpp @@ -153,6 +153,11 @@ class CypherMainVisitor : public antlropencypher::MemgraphCypherBaseVisitor { */ antlrcpp::Any visitEdgeIndexQuery(MemgraphCypher::EdgeIndexQueryContext *ctx) override; + /** + * @return TextIndexQuery* + */ + antlrcpp::Any visitTextIndexQuery(MemgraphCypher::TextIndexQueryContext *ctx) override; + /** * @return ExplainQuery* */ @@ -500,7 +505,7 @@ class CypherMainVisitor : public antlropencypher::MemgraphCypherBaseVisitor { antlrcpp::Any visitCreateIndex(MemgraphCypher::CreateIndexContext *ctx) override; /** - * @return DropIndex* + * @return IndexQuery* */ antlrcpp::Any visitDropIndex(MemgraphCypher::DropIndexContext *ctx) override; @@ -514,6 +519,16 @@ class CypherMainVisitor : public antlropencypher::MemgraphCypherBaseVisitor { */ antlrcpp::Any visitDropEdgeIndex(MemgraphCypher::DropEdgeIndexContext *ctx) override; + /** + * @return TextIndexQuery* + */ + antlrcpp::Any visitCreateTextIndex(MemgraphCypher::CreateTextIndexContext *ctx) override; + + /** + * @return TextIndexQuery* + */ + antlrcpp::Any visitDropTextIndex(MemgraphCypher::DropTextIndexContext *ctx) override; + /** * @return AuthQuery* */ diff --git a/src/query/frontend/opencypher/grammar/Cypher.g4 b/src/query/frontend/opencypher/grammar/Cypher.g4 index 7fa218598..911615314 100644 --- a/src/query/frontend/opencypher/grammar/Cypher.g4 +++ b/src/query/frontend/opencypher/grammar/Cypher.g4 @@ -25,6 +25,7 @@ statement : query ; query : cypherQuery | indexQuery + | textIndexQuery | explainQuery | profileQuery | databaseInfoQuery @@ -65,6 +66,8 @@ cypherQuery : singleQuery ( cypherUnion )* ( queryMemoryLimit )? 
; indexQuery : createIndex | dropIndex; +textIndexQuery : createTextIndex | dropTextIndex; + singleQuery : clause ( clause )* ; cypherUnion : ( UNION ALL singleQuery ) @@ -342,6 +345,12 @@ createIndex : CREATE INDEX ON ':' labelName ( '(' propertyKeyName ')' )? ; dropIndex : DROP INDEX ON ':' labelName ( '(' propertyKeyName ')' )? ; +indexName : symbolicName ; + +createTextIndex : CREATE TEXT INDEX indexName ON ':' labelName ; + +dropTextIndex : DROP TEXT INDEX indexName ; + doubleLiteral : FloatingLiteral ; cypherKeyword : ALL diff --git a/src/query/frontend/opencypher/grammar/CypherLexer.g4 b/src/query/frontend/opencypher/grammar/CypherLexer.g4 index 3e3c640d6..fb8a30b0f 100644 --- a/src/query/frontend/opencypher/grammar/CypherLexer.g4 +++ b/src/query/frontend/opencypher/grammar/CypherLexer.g4 @@ -131,6 +131,7 @@ SHOW : S H O W ; SINGLE : S I N G L E ; STARTS : S T A R T S ; STORAGE : S T O R A G E ; +TEXT : T E X T ; THEN : T H E N ; TRUE : T R U E ; UNION : U N I O N ; diff --git a/src/query/frontend/opencypher/grammar/MemgraphCypher.g4 b/src/query/frontend/opencypher/grammar/MemgraphCypher.g4 index 378310c22..ad15d6213 100644 --- a/src/query/frontend/opencypher/grammar/MemgraphCypher.g4 +++ b/src/query/frontend/opencypher/grammar/MemgraphCypher.g4 @@ -134,6 +134,7 @@ symbolicName : UnescapedSymbolicName query : cypherQuery | indexQuery | edgeIndexQuery + | textIndexQuery | explainQuery | profileQuery | databaseInfoQuery diff --git a/src/query/frontend/semantic/required_privileges.cpp b/src/query/frontend/semantic/required_privileges.cpp index 15726e3e2..d87fcb10e 100644 --- a/src/query/frontend/semantic/required_privileges.cpp +++ b/src/query/frontend/semantic/required_privileges.cpp @@ -29,6 +29,8 @@ class PrivilegeExtractor : public QueryVisitor, public HierarchicalTreeVis void Visit(EdgeIndexQuery & /*unused*/) override { AddPrivilege(AuthQuery::Privilege::INDEX); } + void Visit(TextIndexQuery & /*unused*/) override { 
AddPrivilege(AuthQuery::Privilege::INDEX); } + void Visit(AnalyzeGraphQuery & /*unused*/) override { AddPrivilege(AuthQuery::Privilege::INDEX); } void Visit(AuthQuery & /*unused*/) override { AddPrivilege(AuthQuery::Privilege::AUTH); } diff --git a/src/query/frontend/semantic/symbol.hpp b/src/query/frontend/semantic/symbol.hpp index 0cfb86608..1a5aa2756 100644 --- a/src/query/frontend/semantic/symbol.hpp +++ b/src/query/frontend/semantic/symbol.hpp @@ -1,4 +1,4 @@ -// Copyright 2023 Memgraph Ltd. +// Copyright 2024 Memgraph Ltd. // // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source diff --git a/src/query/interpreter.cpp b/src/query/interpreter.cpp index 1322a7b99..332054485 100644 --- a/src/query/interpreter.cpp +++ b/src/query/interpreter.cpp @@ -39,6 +39,7 @@ #include "dbms/dbms_handler.hpp" #include "dbms/global.hpp" #include "dbms/inmemory/storage_helper.hpp" +#include "flags/experimental.hpp" #include "flags/replication.hpp" #include "flags/run_time_configurable.hpp" #include "glue/communication.hpp" @@ -2709,6 +2710,75 @@ PreparedQuery PrepareEdgeIndexQuery(ParsedQuery parsed_query, bool in_explicit_t RWType::W}; } +PreparedQuery PrepareTextIndexQuery(ParsedQuery parsed_query, bool in_explicit_transaction, + std::vector *notifications, CurrentDB &current_db) { + if (in_explicit_transaction) { + throw IndexInMulticommandTxException(); + } + + auto *text_index_query = utils::Downcast(parsed_query.query); + std::function handler; + + // TODO: we will need transaction for replication + MG_ASSERT(current_db.db_acc_, "Text index query expects a current DB"); + auto &db_acc = *current_db.db_acc_; + + MG_ASSERT(current_db.db_transactional_accessor_, "Text index query expects a current DB transaction"); + auto *dba = &*current_db.execution_db_accessor_; + + // Creating an index influences computed plan costs. 
+ auto invalidate_plan_cache = [plan_cache = db_acc->plan_cache()] { + plan_cache->WithLock([&](auto &cache) { cache.reset(); }); + }; + + auto *storage = db_acc->storage(); + auto label = storage->NameToLabel(text_index_query->label_.name); + auto &index_name = text_index_query->index_name_; + + Notification index_notification(SeverityLevel::INFO); + switch (text_index_query->action_) { + case TextIndexQuery::Action::CREATE: { + index_notification.code = NotificationCode::CREATE_INDEX; + index_notification.title = fmt::format("Created text index on label {}.", text_index_query->label_.name); + // TODO: not just storage + invalidate_plan_cache. Need a DB transaction (for replication) + handler = [dba, label, index_name, + invalidate_plan_cache = std::move(invalidate_plan_cache)](Notification &index_notification) { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw TextSearchDisabledException(); + } + dba->CreateTextIndex(index_name, label); + utils::OnScopeExit invalidator(invalidate_plan_cache); + }; + break; + } + case TextIndexQuery::Action::DROP: { + index_notification.code = NotificationCode::DROP_INDEX; + index_notification.title = fmt::format("Dropped text index on label {}.", text_index_query->label_.name); + // TODO: not just storage + invalidate_plan_cache. 
Need a DB transaction (for replication) + handler = [dba, index_name, + invalidate_plan_cache = std::move(invalidate_plan_cache)](Notification &index_notification) { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw TextSearchDisabledException(); + } + dba->DropTextIndex(index_name); + utils::OnScopeExit invalidator(invalidate_plan_cache); + }; + break; + } + } + + return PreparedQuery{ + {}, + std::move(parsed_query.required_privileges), + [handler = std::move(handler), notifications, index_notification = std::move(index_notification)]( + AnyStream * /*stream*/, std::optional /*unused*/) mutable { + handler(index_notification); + notifications->push_back(index_notification); + return QueryHandlerResult::COMMIT; // Already returns COMMIT; TODO: revisit when we fix replication + }, + RWType::W}; +} + PreparedQuery PrepareAuthQuery(ParsedQuery parsed_query, bool in_explicit_transaction, InterpreterContext *interpreter_context, Interpreter &interpreter) { if (in_explicit_transaction) { @@ -3499,7 +3569,7 @@ PreparedQuery PrepareDatabaseInfoQuery(ParsedQuery parsed_query, bool in_explici } MG_ASSERT(current_db.db_acc_, "Database info query expects a current DB"); - MG_ASSERT(current_db.db_transactional_accessor_, "Database ifo query expects a current DB transaction"); + MG_ASSERT(current_db.db_transactional_accessor_, "Database info query expects a current DB transaction"); auto *dba = &*current_db.execution_db_accessor_; auto *info_query = utils::Downcast(parsed_query.query); @@ -3514,10 +3584,11 @@ PreparedQuery PrepareDatabaseInfoQuery(ParsedQuery parsed_query, bool in_explici const std::string_view label_index_mark{"label"}; const std::string_view label_property_index_mark{"label+property"}; const std::string_view edge_type_index_mark{"edge-type"}; + const std::string_view text_index_mark{"text"}; auto info = dba->ListAllIndices(); auto storage_acc = database->Access(); std::vector> results; - results.reserve(info.label.size() + 
info.label_property.size()); + results.reserve(info.label.size() + info.label_property.size() + info.text_indices.size()); for (const auto &item : info.label) { results.push_back({TypedValue(label_index_mark), TypedValue(storage->LabelToName(item)), TypedValue(), TypedValue(static_cast(storage_acc->ApproximateVertexCount(item)))}); @@ -3532,6 +3603,10 @@ PreparedQuery PrepareDatabaseInfoQuery(ParsedQuery parsed_query, bool in_explici results.push_back({TypedValue(edge_type_index_mark), TypedValue(storage->EdgeTypeToName(item)), TypedValue(), TypedValue(static_cast(storage_acc->ApproximateEdgeCount(item)))}); } + for (const auto &[index_name, label] : info.text_indices) { + results.push_back({TypedValue(fmt::format("{} (name: {})", text_index_mark, index_name)), + TypedValue(storage->LabelToName(label)), TypedValue(), TypedValue()}); + } std::sort(results.begin(), results.end(), [&label_index_mark](const auto &record_1, const auto &record_2) { const auto type_1 = record_1[0].ValueString(); const auto type_2 = record_2[0].ValueString(); @@ -4293,13 +4368,15 @@ Interpreter::PrepareResult Interpreter::Prepare(const std::string &query_string, utils::Downcast(parsed_query.query) || utils::Downcast(parsed_query.query) || utils::Downcast(parsed_query.query) || utils::Downcast(parsed_query.query) || utils::Downcast(parsed_query.query) || utils::Downcast(parsed_query.query) || - utils::Downcast(parsed_query.query) || utils::Downcast(parsed_query.query); + utils::Downcast(parsed_query.query) || utils::Downcast(parsed_query.query) || + utils::Downcast(parsed_query.query); if (!in_explicit_transaction_ && requires_db_transaction) { // TODO: ATM only a single database, will change when we have multiple database transactions bool could_commit = utils::Downcast(parsed_query.query) != nullptr; bool unique = utils::Downcast(parsed_query.query) != nullptr || utils::Downcast(parsed_query.query) != nullptr || + utils::Downcast(parsed_query.query) != nullptr || 
utils::Downcast(parsed_query.query) != nullptr || upper_case_query.find(kSchemaAssert) != std::string::npos; SetupDatabaseTransaction(could_commit, unique); @@ -4337,6 +4414,9 @@ Interpreter::PrepareResult Interpreter::Prepare(const std::string &query_string, } else if (utils::Downcast(parsed_query.query)) { prepared_query = PrepareEdgeIndexQuery(std::move(parsed_query), in_explicit_transaction_, &query_execution->notifications, current_db_); + } else if (utils::Downcast(parsed_query.query)) { + prepared_query = PrepareTextIndexQuery(std::move(parsed_query), in_explicit_transaction_, + &query_execution->notifications, current_db_); } else if (utils::Downcast(parsed_query.query)) { prepared_query = PrepareAnalyzeGraphQuery(std::move(parsed_query), in_explicit_transaction_, current_db_); } else if (utils::Downcast(parsed_query.query)) { diff --git a/src/query/plan/operator.cpp b/src/query/plan/operator.cpp index 2b970cf49..ff6c1dc9a 100644 --- a/src/query/plan/operator.cpp +++ b/src/query/plan/operator.cpp @@ -32,6 +32,7 @@ #include "spdlog/spdlog.h" #include "csv/parsing.hpp" +#include "flags/experimental.hpp" #include "license/license.hpp" #include "query/auth_checker.hpp" #include "query/context.hpp" @@ -266,6 +267,10 @@ VertexAccessor &CreateLocalVertex(const NodeCreationInfo &node_info, Frame *fram } MultiPropsInitChecked(&new_node, properties); + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + context.db_accessor->TextIndexAddVertex(new_node); + } + (*frame)[node_info.symbol] = new_node; return (*frame)[node_info.symbol].ValueVertex(); } @@ -2991,6 +2996,9 @@ bool SetProperty::SetPropertyCursor::Pull(Frame &frame, ExecutionContext &contex context.trigger_context_collector->RegisterSetObjectProperty(lhs.ValueVertex(), self_.property_, TypedValue{std::move(old_value)}, TypedValue{rhs}); } + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + context.db_accessor->TextIndexUpdateVertex(lhs.ValueVertex()); + } break; } case 
TypedValue::Type::Edge: { @@ -3147,6 +3155,9 @@ void SetPropertiesOnRecord(TRecordAccessor *record, const TypedValue &rhs, SetPr case TypedValue::Type::Vertex: { PropertiesMap new_properties = get_props(rhs.ValueVertex()); update_props(new_properties); + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + context->db_accessor->TextIndexUpdateVertex(rhs.ValueVertex()); + } break; } case TypedValue::Type::Map: { @@ -3204,6 +3215,9 @@ bool SetProperties::SetPropertiesCursor::Pull(Frame &frame, ExecutionContext &co } #endif SetPropertiesOnRecord(&lhs.ValueVertex(), rhs, self_.op_, &context, cached_name_id_); + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + context.db_accessor->TextIndexUpdateVertex(lhs.ValueVertex()); + } break; case TypedValue::Type::Edge: #ifdef MG_ENTERPRISE @@ -3295,6 +3309,10 @@ bool SetLabels::SetLabelsCursor::Pull(Frame &frame, ExecutionContext &context) { } } + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + context.db_accessor->TextIndexUpdateVertex(vertex); + } + return true; } @@ -3366,6 +3384,9 @@ bool RemoveProperty::RemovePropertyCursor::Pull(Frame &frame, ExecutionContext & } #endif remove_prop(&lhs.ValueVertex()); + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + context.db_accessor->TextIndexUpdateVertex(lhs.ValueVertex()); + } break; case TypedValue::Type::Edge: #ifdef MG_ENTERPRISE @@ -3458,6 +3479,10 @@ bool RemoveLabels::RemoveLabelsCursor::Pull(Frame &frame, ExecutionContext &cont } } + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + context.db_accessor->TextIndexUpdateVertex(vertex, EvaluateLabels(self_.labels_, evaluator, context.db_accessor)); + } + return true; } diff --git a/src/query/plan/vertex_count_cache.hpp b/src/query/plan/vertex_count_cache.hpp index 802f4e09f..69e002c0a 100644 --- a/src/query/plan/vertex_count_cache.hpp +++ b/src/query/plan/vertex_count_cache.hpp @@ -1,4 +1,4 @@ -// Copyright 2023 Memgraph 
Ltd. +// Copyright 2024 Memgraph Ltd. // // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source diff --git a/src/query/procedure/mg_procedure_impl.cpp b/src/query/procedure/mg_procedure_impl.cpp index d6ce3c7b7..a2bc23aa3 100644 --- a/src/query/procedure/mg_procedure_impl.cpp +++ b/src/query/procedure/mg_procedure_impl.cpp @@ -23,6 +23,8 @@ #include #include +#include "flags/experimental.hpp" +#include "flags/run_time_configurable.hpp" #include "license/license.hpp" #include "mg_procedure.h" #include "module.hpp" @@ -32,6 +34,7 @@ #include "query/procedure/fmt.hpp" #include "query/procedure/mg_procedure_helpers.hpp" #include "query/stream/common.hpp" +#include "storage/v2/indices/text_index.hpp" #include "storage/v2/property_value.hpp" #include "storage/v2/storage_mode.hpp" #include "storage/v2/view.hpp" @@ -1843,6 +1846,11 @@ mgp_error mgp_vertex_set_property(struct mgp_vertex *v, const char *property_nam const auto result = std::visit( [prop_key, property_value](auto &impl) { return impl.SetProperty(prop_key, ToPropertyValue(*property_value)); }, v->impl); + if (memgraph::flags::AreExperimentsEnabled(memgraph::flags::Experiments::TEXT_SEARCH) && !result.HasError()) { + auto v_impl = v->getImpl(); + v->graph->getImpl()->TextIndexUpdateVertex(v_impl); + } + if (result.HasError()) { switch (result.GetError()) { case memgraph::storage::Error::DELETED_OBJECT: @@ -1899,6 +1907,11 @@ mgp_error mgp_vertex_set_properties(struct mgp_vertex *v, struct mgp_map *proper } const auto result = v->getImpl().UpdateProperties(props); + if (memgraph::flags::AreExperimentsEnabled(memgraph::flags::Experiments::TEXT_SEARCH) && !result.HasError()) { + auto v_impl = v->getImpl(); + v->graph->getImpl()->TextIndexUpdateVertex(v_impl); + } + if (result.HasError()) { switch (result.GetError()) { case memgraph::storage::Error::DELETED_OBJECT: @@ -1956,6 +1969,10 
@@ mgp_error mgp_vertex_add_label(struct mgp_vertex *v, mgp_label label) { } const auto result = std::visit([label_id](auto &impl) { return impl.AddLabel(label_id); }, v->impl); + if (memgraph::flags::AreExperimentsEnabled(memgraph::flags::Experiments::TEXT_SEARCH) && !result.HasError()) { + auto v_impl = v->getImpl(); + v->graph->getImpl()->TextIndexUpdateVertex(v_impl); + } if (result.HasError()) { switch (result.GetError()) { @@ -1998,6 +2015,10 @@ mgp_error mgp_vertex_remove_label(struct mgp_vertex *v, mgp_label label) { throw ImmutableObjectException{"Cannot remove a label from an immutable vertex!"}; } const auto result = std::visit([label_id](auto &impl) { return impl.RemoveLabel(label_id); }, v->impl); + if (memgraph::flags::AreExperimentsEnabled(memgraph::flags::Experiments::TEXT_SEARCH) && !result.HasError()) { + auto v_impl = v->getImpl(); + v->graph->getImpl()->TextIndexUpdateVertex(v_impl, {label_id}); + } if (result.HasError()) { switch (result.GetError()) { @@ -2590,7 +2611,7 @@ mgp_error mgp_edge_iter_properties(mgp_edge *e, mgp_memory *memory, mgp_properti mgp_error mgp_graph_get_vertex_by_id(mgp_graph *graph, mgp_vertex_id id, mgp_memory *memory, mgp_vertex **result) { return WrapExceptions( [graph, id, memory]() -> mgp_vertex * { - std::optional maybe_vertex = std::visit( + auto maybe_vertex = std::visit( [graph, id](auto *impl) { return impl->FindVertex(memgraph::storage::Gid::FromInt(id.as_int), graph->view); }, @@ -2967,6 +2988,10 @@ mgp_error mgp_graph_create_vertex(struct mgp_graph *graph, mgp_memory *memory, m } auto *vertex = std::visit( [=](auto *impl) { return NewRawMgpObject(memory, impl->InsertVertex(), graph); }, graph->impl); + if (memgraph::flags::AreExperimentsEnabled(memgraph::flags::Experiments::TEXT_SEARCH)) { + auto v_impl = vertex->getImpl(); + vertex->graph->getImpl()->TextIndexAddVertex(v_impl); + } auto &ctx = graph->ctx; ctx->execution_stats[memgraph::query::ExecutionStats::Key::CREATED_NODES] += 1; @@ -3324,6 +3349,140 @@ 
mgp_error mgp_graph_delete_edge(struct mgp_graph *graph, mgp_edge *edge) { }); } +mgp_error mgp_graph_has_text_index(mgp_graph *graph, const char *index_name, int *result) { + return WrapExceptions([graph, index_name, result]() { + std::visit(memgraph::utils::Overloaded{ + [&](memgraph::query::DbAccessor *impl) { *result = impl->TextIndexExists(index_name); }, + [&](memgraph::query::SubgraphDbAccessor *impl) { + *result = impl->GetAccessor()->TextIndexExists(index_name); + }}, + graph->impl); + }); +} + +mgp_vertex *GetVertexByGid(mgp_graph *graph, memgraph::storage::Gid id, mgp_memory *memory) { + auto get_vertex_by_gid = memgraph::utils::Overloaded{ + [graph, id, memory](memgraph::query::DbAccessor *impl) -> mgp_vertex * { + auto maybe_vertex = impl->FindVertex(id, graph->view); + if (!maybe_vertex) return nullptr; + return NewRawMgpObject(memory, *maybe_vertex, graph); + }, + [graph, id, memory](memgraph::query::SubgraphDbAccessor *impl) -> mgp_vertex * { + auto maybe_vertex = impl->FindVertex(id, graph->view); + if (!maybe_vertex) return nullptr; + return NewRawMgpObject( + memory, memgraph::query::SubgraphVertexAccessor(*maybe_vertex, impl->getGraph()), graph); + }}; + return std::visit(get_vertex_by_gid, graph->impl); +} + +void WrapTextSearch(mgp_graph *graph, mgp_memory *memory, mgp_map **result, + const std::vector &vertex_ids = {}, + const std::optional &error_msg = std::nullopt) { + if (const auto err = mgp_map_make_empty(memory, result); err != mgp_error::MGP_ERROR_NO_ERROR) { + throw std::logic_error("Retrieving text search results failed during creation of a mgp_map"); + } + + mgp_value *error_value; + if (error_msg.has_value()) { + if (const auto err = mgp_value_make_string(error_msg.value().data(), memory, &error_value); + err != mgp_error::MGP_ERROR_NO_ERROR) { + throw std::logic_error("Retrieving text search results failed during creation of a string mgp_value"); + } + } + + mgp_list *search_results{}; + if (const auto err = 
mgp_list_make_empty(vertex_ids.size(), memory, &search_results); + err != mgp_error::MGP_ERROR_NO_ERROR) { + throw std::logic_error("Retrieving text search results failed during creation of a mgp_list"); + } + + for (const auto &vertex_id : vertex_ids) { + mgp_value *vertex; + if (const auto err = mgp_value_make_vertex(GetVertexByGid(graph, vertex_id, memory), &vertex); + err != mgp_error::MGP_ERROR_NO_ERROR) { + throw std::logic_error("Retrieving text search results failed during creation of a vertex mgp_value"); + } + if (const auto err = mgp_list_append(search_results, vertex); err != mgp_error::MGP_ERROR_NO_ERROR) { + throw std::logic_error( + "Retrieving text search results failed during insertion of the mgp_value into the result list"); + } + } + + mgp_value *search_results_value; + if (const auto err = mgp_value_make_list(search_results, &search_results_value); + err != mgp_error::MGP_ERROR_NO_ERROR) { + throw std::logic_error("Retrieving text search results failed during creation of a list mgp_value"); + } + + if (error_msg.has_value()) { + if (const auto err = mgp_map_insert(*result, "error_msg", error_value); err != mgp_error::MGP_ERROR_NO_ERROR) { + throw std::logic_error("Retrieving text index search error failed during insertion into mgp_map"); + } + return; + } + + if (const auto err = mgp_map_insert(*result, "search_results", search_results_value); + err != mgp_error::MGP_ERROR_NO_ERROR) { + throw std::logic_error("Retrieving text index search results failed during insertion into mgp_map"); + } +} + +void WrapTextIndexAggregation(mgp_memory *memory, mgp_map **result, const std::string &aggregation_result, + const std::optional &error_msg = std::nullopt) { + if (const auto err = mgp_map_make_empty(memory, result); err != mgp_error::MGP_ERROR_NO_ERROR) { + throw std::logic_error("Retrieving text search results failed during creation of a mgp_map"); + } + + mgp_value *aggregation_result_or_error_value; + if (const auto err = 
mgp_value_make_string(error_msg.value_or(aggregation_result).data(), memory, + &aggregation_result_or_error_value); + err != mgp_error::MGP_ERROR_NO_ERROR) { + throw std::logic_error("Retrieving text search results failed during creation of a string mgp_value"); + } + + if (error_msg.has_value()) { + if (const auto err = mgp_map_insert(*result, "error_msg", aggregation_result_or_error_value); + err != mgp_error::MGP_ERROR_NO_ERROR) { + throw std::logic_error("Retrieving text index aggregation error failed during insertion into mgp_map"); + } + return; + } + + if (const auto err = mgp_map_insert(*result, "aggregation_results", aggregation_result_or_error_value); + err != mgp_error::MGP_ERROR_NO_ERROR) { + throw std::logic_error("Retrieving text index aggregation results failed during insertion into mgp_map"); + } +} + +mgp_error mgp_graph_search_text_index(mgp_graph *graph, const char *index_name, const char *search_query, + text_search_mode search_mode, mgp_memory *memory, mgp_map **result) { + return WrapExceptions([graph, memory, index_name, search_query, search_mode, result]() { + std::vector found_vertices_ids; + std::optional error_msg = std::nullopt; + try { + found_vertices_ids = graph->getImpl()->TextIndexSearch(index_name, search_query, search_mode); + } catch (memgraph::query::QueryException &e) { + error_msg = e.what(); + } + WrapTextSearch(graph, memory, result, found_vertices_ids, error_msg); + }); +} + +mgp_error mgp_graph_aggregate_over_text_index(mgp_graph *graph, const char *index_name, const char *search_query, + const char *aggregation_query, mgp_memory *memory, mgp_map **result) { + return WrapExceptions([graph, memory, index_name, search_query, aggregation_query, result]() { + std::string search_results; + std::optional error_msg = std::nullopt; + try { + search_results = graph->getImpl()->TextIndexAggregate(index_name, search_query, aggregation_query); + } catch (memgraph::query::QueryException &e) { + error_msg = e.what(); + } + 
WrapTextIndexAggregation(memory, result, search_results, error_msg); + }); +} + #ifdef MG_ENTERPRISE namespace { void NextPermitted(mgp_vertices_iterator &it) { diff --git a/src/query/procedure/mg_procedure_impl.hpp b/src/query/procedure/mg_procedure_impl.hpp index 17cac4eca..a91b4386c 100644 --- a/src/query/procedure/mg_procedure_impl.hpp +++ b/src/query/procedure/mg_procedure_impl.hpp @@ -1,4 +1,4 @@ -// Copyright 2023 Memgraph Ltd. +// Copyright 2024 Memgraph Ltd. // // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -562,6 +562,13 @@ struct mgp_graph { memgraph::query::ExecutionContext *ctx; memgraph::storage::StorageMode storage_mode; + memgraph::query::DbAccessor *getImpl() const { + return std::visit( + memgraph::utils::Overloaded{[](memgraph::query::DbAccessor *impl) { return impl; }, + [](memgraph::query::SubgraphDbAccessor *impl) { return impl->GetAccessor(); }}, + this->impl); + } + static mgp_graph WritableGraph(memgraph::query::DbAccessor &acc, memgraph::storage::View view, memgraph::query::ExecutionContext &ctx) { return mgp_graph{&acc, view, &ctx, acc.GetStorageMode()}; diff --git a/src/storage/v2/CMakeLists.txt b/src/storage/v2/CMakeLists.txt index ec5108d63..49601eb54 100644 --- a/src/storage/v2/CMakeLists.txt +++ b/src/storage/v2/CMakeLists.txt @@ -20,6 +20,7 @@ add_library(mg-storage-v2 STATIC vertex_info_cache.cpp storage.cpp indices/indices.cpp + indices/text_index.cpp all_vertices_iterable.cpp edges_iterable.cpp vertices_iterable.cpp @@ -45,4 +46,5 @@ add_library(mg-storage-v2 STATIC inmemory/replication/recovery.cpp ) -target_link_libraries(mg-storage-v2 mg::replication Threads::Threads mg-utils gflags absl::flat_hash_map mg-rpc mg-slk mg-events mg-memory) +target_include_directories(mg-storage-v2 PUBLIC ${CMAKE_SOURCE_DIR}/include) +target_link_libraries(mg-storage-v2 mg::replication Threads::Threads 
mg-utils mg-flags gflags absl::flat_hash_map mg-rpc mg-slk mg-events mg-memory mgcxx_text_search tantivy_text_search) diff --git a/src/storage/v2/disk/durable_metadata.cpp b/src/storage/v2/disk/durable_metadata.cpp index fe2c558ae..13d515af2 100644 --- a/src/storage/v2/disk/durable_metadata.cpp +++ b/src/storage/v2/disk/durable_metadata.cpp @@ -1,4 +1,4 @@ -// Copyright 2023 Memgraph Ltd. +// Copyright 2024 Memgraph Ltd. // // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -26,6 +26,7 @@ constexpr const char *kVertexCountDescr = "vertex_count"; constexpr const char *kEdgeDountDescr = "edge_count"; constexpr const char *kLabelIndexStr = "label_index"; constexpr const char *kLabelPropertyIndexStr = "label_property_index"; +constexpr const char *kTextIndexStr = "text_index"; constexpr const char *kExistenceConstraintsStr = "existence_constraints"; constexpr const char *kUniqueConstraintsStr = "unique_constraints"; } // namespace @@ -144,6 +145,31 @@ bool DurableMetadata::PersistLabelPropertyIndexAndExistenceConstraintDeletion(La return true; } +bool DurableMetadata::PersistTextIndexCreation(const std::string &index_name, LabelId label) { + const std::string index_name_label_pair = index_name + "," + label.ToString(); + if (auto text_index_store = durability_kvstore_.Get(kTextIndexStr); text_index_store.has_value()) { + std::string &value = text_index_store.value(); + value += "|"; + value += index_name_label_pair; + return durability_kvstore_.Put(kTextIndexStr, value); + } + return durability_kvstore_.Put(kTextIndexStr, index_name_label_pair); +} + +bool DurableMetadata::PersistTextIndexDeletion(const std::string &index_name, LabelId label) { + const std::string index_name_label_pair = index_name + "," + label.ToString(); + if (auto text_index_store = durability_kvstore_.Get(kTextIndexStr); text_index_store.has_value()) { + const 
std::string &value = text_index_store.value(); + std::vector text_indices = utils::Split(value, "|"); + std::erase(text_indices, index_name_label_pair); + if (text_indices.empty()) { + return durability_kvstore_.Delete(kTextIndexStr); + } + return durability_kvstore_.Put(kTextIndexStr, utils::Join(text_indices, "|")); + } + return true; +} + bool DurableMetadata::PersistUniqueConstraintCreation(LabelId label, const std::set &properties) { const std::string entry = utils::GetKeyForUniqueConstraintsDurability(label, properties); diff --git a/src/storage/v2/disk/durable_metadata.hpp b/src/storage/v2/disk/durable_metadata.hpp index 168cce469..4aaa8a707 100644 --- a/src/storage/v2/disk/durable_metadata.hpp +++ b/src/storage/v2/disk/durable_metadata.hpp @@ -1,4 +1,4 @@ -// Copyright 2023 Memgraph Ltd. +// Copyright 2024 Memgraph Ltd. // // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -53,6 +53,10 @@ class DurableMetadata { bool PersistLabelPropertyIndexAndExistenceConstraintDeletion(LabelId label, PropertyId property, const std::string &key); + bool PersistTextIndexCreation(const std::string &index_name, LabelId label); + + bool PersistTextIndexDeletion(const std::string &index_name, LabelId label); + bool PersistUniqueConstraintCreation(LabelId label, const std::set &properties); bool PersistUniqueConstraintDeletion(LabelId label, const std::set &properties); diff --git a/src/storage/v2/disk/storage.cpp b/src/storage/v2/disk/storage.cpp index 21fa5ecc7..4dbd248f7 100644 --- a/src/storage/v2/disk/storage.cpp +++ b/src/storage/v2/disk/storage.cpp @@ -29,6 +29,8 @@ #include #include +#include "flags/experimental.hpp" +#include "flags/run_time_configurable.hpp" #include "kvstore/kvstore.hpp" #include "spdlog/spdlog.h" #include "storage/v2/constraints/unique_constraints.hpp" @@ -856,6 +858,7 @@ StorageInfo 
DiskStorage::GetInfo(memgraph::replication_coordination_glue::Replic const auto &lbl = access->ListAllIndices(); info.label_indices = lbl.label.size(); info.label_property_indices = lbl.label_property.size(); + info.text_indices = lbl.text_indices.size(); const auto &con = access->ListAllConstraints(); info.existence_constraints = con.existence.size(); info.unique_constraints = con.unique.size(); @@ -1670,6 +1673,18 @@ utils::BasicResult DiskStorage::DiskAccessor::Co case MetadataDelta::Action::LABEL_PROPERTY_INDEX_STATS_CLEAR: { throw utils::NotYetImplemented("ClearIndexStats(stats) is not implemented for DiskStorage."); } break; + case MetadataDelta::Action::TEXT_INDEX_CREATE: { + const auto &info = md_delta.text_index; + if (!disk_storage->durable_metadata_.PersistTextIndexCreation(info.index_name, info.label)) { + return StorageManipulationError{PersistenceError{}}; + } + } break; + case MetadataDelta::Action::TEXT_INDEX_DROP: { + const auto &info = md_delta.text_index; + if (!disk_storage->durable_metadata_.PersistTextIndexDeletion(info.index_name, info.label)) { + return StorageManipulationError{PersistenceError{}}; + } + } break; case MetadataDelta::Action::EXISTENCE_CONSTRAINT_CREATE: { const auto &info = md_delta.label_property; if (!disk_storage->durable_metadata_.PersistLabelPropertyIndexAndExistenceConstraintCreation( @@ -1768,6 +1783,9 @@ utils::BasicResult DiskStorage::DiskAccessor::Co return StorageManipulationError{SerializationError{}}; } spdlog::trace("rocksdb: Commit successful"); + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + disk_storage->indices_.text_index_.Commit(); + } is_transaction_active_ = false; @@ -1886,6 +1904,9 @@ void DiskStorage::DiskAccessor::Abort() { // query_plan_accumulate_aggregate.cpp transaction_.disk_transaction_->Rollback(); transaction_.disk_transaction_->ClearSnapshot(); + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + storage_->indices_.text_index_.Rollback(); + } 
delete transaction_.disk_transaction_; transaction_.disk_transaction_ = nullptr; is_transaction_active_ = false; @@ -2092,7 +2113,11 @@ IndicesInfo DiskStorage::DiskAccessor::ListAllIndices() const { auto *disk_label_index = static_cast(on_disk->indices_.label_index_.get()); auto *disk_label_property_index = static_cast(on_disk->indices_.label_property_index_.get()); - return {disk_label_index->ListIndices(), disk_label_property_index->ListIndices()}; + auto &text_index = storage_->indices_.text_index_; + return {disk_label_index->ListIndices(), + disk_label_property_index->ListIndices(), + {/* edge type indices */}, + text_index.ListIndices()}; } ConstraintsInfo DiskStorage::DiskAccessor::ListAllConstraints() const { auto *disk_storage = static_cast(storage_); diff --git a/src/storage/v2/durability/durability.cpp b/src/storage/v2/durability/durability.cpp index fbbedbee5..db8bcd93b 100644 --- a/src/storage/v2/durability/durability.cpp +++ b/src/storage/v2/durability/durability.cpp @@ -151,7 +151,8 @@ void RecoverConstraints(const RecoveredIndicesAndConstraints::ConstraintsMetadat void RecoverIndicesAndStats(const RecoveredIndicesAndConstraints::IndicesMetadata &indices_metadata, Indices *indices, utils::SkipList *vertices, NameIdMapper *name_id_mapper, - const std::optional ¶llel_exec_info) { + const std::optional ¶llel_exec_info, + const std::optional &storage_dir) { spdlog::info("Recreating indices from metadata."); // Recover label indices. @@ -211,6 +212,26 @@ void RecoverIndicesAndStats(const RecoveredIndicesAndConstraints::IndicesMetadat } spdlog::info("Edge-type indices are recreated."); + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + // Recover text indices. 
+ spdlog::info("Recreating {} text indices from metadata.", indices_metadata.text_indices.size()); + auto &mem_text_index = indices->text_index_; + for (const auto &[index_name, label] : indices_metadata.text_indices) { + try { + if (!storage_dir.has_value()) { + throw RecoveryFailure("There must exist a storage directory in order to recover text indices!"); + } + + mem_text_index.RecoverIndex(storage_dir.value(), index_name, label, vertices->access(), name_id_mapper); + } catch (...) { + throw RecoveryFailure("The text index must be created here!"); + } + spdlog::info("Text index {} on :{} is recreated from metadata", index_name, + name_id_mapper->IdToName(label.AsUint())); + } + spdlog::info("Text indices are recreated."); + } + spdlog::info("Indices are recreated."); } @@ -331,8 +352,13 @@ std::optional Recovery::RecoverData(std::string *uuid, Replication repl_storage_state.epoch_.SetEpoch(std::move(recovered_snapshot->snapshot_info.epoch_id)); if (!utils::DirExists(wal_directory_)) { + std::optional storage_dir = std::nullopt; + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + storage_dir = config.durability.storage_directory; + } + RecoverIndicesAndStats(indices_constraints.indices, indices, vertices, name_id_mapper, - GetParallelExecInfoIndices(recovery_info, config)); + GetParallelExecInfoIndices(recovery_info, config), storage_dir); RecoverConstraints(indices_constraints.constraints, constraints, vertices, name_id_mapper, GetParallelExecInfo(recovery_info, config)); return recovered_snapshot->recovery_info; @@ -467,8 +493,13 @@ std::optional Recovery::RecoverData(std::string *uuid, Replication spdlog::info("All necessary WAL files are loaded successfully."); } + std::optional storage_dir = std::nullopt; + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + storage_dir = config.durability.storage_directory; + } + RecoverIndicesAndStats(indices_constraints.indices, indices, vertices, name_id_mapper, - 
GetParallelExecInfoIndices(recovery_info, config)); + GetParallelExecInfoIndices(recovery_info, config), storage_dir); RecoverConstraints(indices_constraints.constraints, constraints, vertices, name_id_mapper, GetParallelExecInfo(recovery_info, config)); diff --git a/src/storage/v2/durability/durability.hpp b/src/storage/v2/durability/durability.hpp index 97e2c7efc..5170b3b04 100644 --- a/src/storage/v2/durability/durability.hpp +++ b/src/storage/v2/durability/durability.hpp @@ -1,4 +1,4 @@ -// Copyright 2023 Memgraph Ltd. +// Copyright 2024 Memgraph Ltd. // // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -102,7 +102,8 @@ std::optional> GetWalFiles(const std::filesystem: /// @throw RecoveryFailure void RecoverIndicesAndStats(const RecoveredIndicesAndConstraints::IndicesMetadata &indices_metadata, Indices *indices, utils::SkipList *vertices, NameIdMapper *name_id_mapper, - const std::optional ¶llel_exec_info = std::nullopt); + const std::optional ¶llel_exec_info = std::nullopt, + const std::optional &storage_dir = std::nullopt); // Helper function used to recover all discovered constraints. 
The // constraints must be recovered after the data recovery is done diff --git a/src/storage/v2/durability/marker.hpp b/src/storage/v2/durability/marker.hpp index ac0cc074d..18d693e51 100644 --- a/src/storage/v2/durability/marker.hpp +++ b/src/storage/v2/durability/marker.hpp @@ -64,6 +64,8 @@ enum class Marker : uint8_t { DELTA_LABEL_PROPERTY_INDEX_STATS_CLEAR = 0x64, DELTA_EDGE_TYPE_INDEX_CREATE = 0x65, DELTA_EDGE_TYPE_INDEX_DROP = 0x66, + DELTA_TEXT_INDEX_CREATE = 0x67, + DELTA_TEXT_INDEX_DROP = 0x68, VALUE_FALSE = 0x00, VALUE_TRUE = 0xff, @@ -110,6 +112,8 @@ static const Marker kMarkersAll[] = { Marker::DELTA_LABEL_PROPERTY_INDEX_DROP, Marker::DELTA_EDGE_TYPE_INDEX_CREATE, Marker::DELTA_EDGE_TYPE_INDEX_DROP, + Marker::DELTA_TEXT_INDEX_CREATE, + Marker::DELTA_TEXT_INDEX_DROP, Marker::DELTA_EXISTENCE_CONSTRAINT_CREATE, Marker::DELTA_EXISTENCE_CONSTRAINT_DROP, Marker::DELTA_UNIQUE_CONSTRAINT_CREATE, diff --git a/src/storage/v2/durability/metadata.hpp b/src/storage/v2/durability/metadata.hpp index c8ee27b2f..f36fc068d 100644 --- a/src/storage/v2/durability/metadata.hpp +++ b/src/storage/v2/durability/metadata.hpp @@ -44,6 +44,7 @@ struct RecoveredIndicesAndConstraints { std::vector> label_stats; std::vector>> label_property_stats; std::vector edge; + std::vector> text_indices; } indices; struct ConstraintsMetadata { diff --git a/src/storage/v2/durability/serialization.cpp b/src/storage/v2/durability/serialization.cpp index 28ba64943..becfa7f34 100644 --- a/src/storage/v2/durability/serialization.cpp +++ b/src/storage/v2/durability/serialization.cpp @@ -353,6 +353,8 @@ std::optional Decoder::ReadPropertyValue() { case Marker::DELTA_LABEL_PROPERTY_INDEX_DROP: case Marker::DELTA_EDGE_TYPE_INDEX_CREATE: case Marker::DELTA_EDGE_TYPE_INDEX_DROP: + case Marker::DELTA_TEXT_INDEX_CREATE: + case Marker::DELTA_TEXT_INDEX_DROP: case Marker::DELTA_EXISTENCE_CONSTRAINT_CREATE: case Marker::DELTA_EXISTENCE_CONSTRAINT_DROP: case Marker::DELTA_UNIQUE_CONSTRAINT_CREATE: @@ -459,6 
+461,8 @@ bool Decoder::SkipPropertyValue() { case Marker::DELTA_LABEL_PROPERTY_INDEX_DROP: case Marker::DELTA_EDGE_TYPE_INDEX_CREATE: case Marker::DELTA_EDGE_TYPE_INDEX_DROP: + case Marker::DELTA_TEXT_INDEX_CREATE: + case Marker::DELTA_TEXT_INDEX_DROP: case Marker::DELTA_EXISTENCE_CONSTRAINT_CREATE: case Marker::DELTA_EXISTENCE_CONSTRAINT_DROP: case Marker::DELTA_UNIQUE_CONSTRAINT_CREATE: diff --git a/src/storage/v2/durability/snapshot.cpp b/src/storage/v2/durability/snapshot.cpp index 5fea3dfa5..0e3bb96e3 100644 --- a/src/storage/v2/durability/snapshot.cpp +++ b/src/storage/v2/durability/snapshot.cpp @@ -13,6 +13,8 @@ #include +#include "flags/experimental.hpp" +#include "flags/run_time_configurable.hpp" #include "spdlog/spdlog.h" #include "storage/v2/durability/exceptions.hpp" #include "storage/v2/durability/paths.hpp" @@ -2004,6 +2006,24 @@ RecoveredSnapshot LoadSnapshot(const std::filesystem::path &path, utils::SkipLis spdlog::info("Metadata of edge-type indices are recovered."); } + // Recover text indices. 
+ if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + auto size = snapshot.ReadUint(); + if (!size) throw RecoveryFailure("Couldn't recover the number of text indices!"); + spdlog::info("Recovering metadata of {} text indices.", *size); + for (uint64_t i = 0; i < *size; ++i) { + auto index_name = snapshot.ReadString(); + if (!index_name.has_value()) throw RecoveryFailure("Couldn't read text index name!"); + auto label = snapshot.ReadUint(); + if (!label) throw RecoveryFailure("Couldn't read text index label!"); + AddRecoveredIndexConstraint(&indices_constraints.indices.text_indices, + {index_name.value(), get_label_from_id(*label)}, "The text index already exists!"); + SPDLOG_TRACE("Recovered metadata of text index {} for :{}", index_name.value(), + name_id_mapper->IdToName(snapshot_id_map.at(*label))); + } + spdlog::info("Metadata of text indices are recovered."); + } + spdlog::info("Metadata of indices are recovered."); } @@ -2493,6 +2513,16 @@ void CreateSnapshot(Storage *storage, Transaction *transaction, const std::files write_mapping(item); } } + + // Write text indices. + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + auto text_indices = storage->indices_.text_index_.ListIndices(); + snapshot.WriteUint(text_indices.size()); + for (const auto &[index_name, label] : text_indices) { + snapshot.WriteString(index_name); + write_mapping(label); + } + } } // Write constraints. 
diff --git a/src/storage/v2/durability/storage_global_operation.hpp b/src/storage/v2/durability/storage_global_operation.hpp index 7dd635e9d..d9c77b6c6 100644 --- a/src/storage/v2/durability/storage_global_operation.hpp +++ b/src/storage/v2/durability/storage_global_operation.hpp @@ -25,6 +25,8 @@ enum class StorageMetadataOperation { LABEL_PROPERTY_INDEX_STATS_CLEAR, EDGE_TYPE_INDEX_CREATE, EDGE_TYPE_INDEX_DROP, + TEXT_INDEX_CREATE, + TEXT_INDEX_DROP, EXISTENCE_CONSTRAINT_CREATE, EXISTENCE_CONSTRAINT_DROP, UNIQUE_CONSTRAINT_CREATE, diff --git a/src/storage/v2/durability/wal.cpp b/src/storage/v2/durability/wal.cpp index 5c40ab1c5..c684d818c 100644 --- a/src/storage/v2/durability/wal.cpp +++ b/src/storage/v2/durability/wal.cpp @@ -99,6 +99,10 @@ Marker OperationToMarker(StorageMetadataOperation operation) { return Marker::DELTA_EDGE_TYPE_INDEX_CREATE; case StorageMetadataOperation::EDGE_TYPE_INDEX_DROP: return Marker::DELTA_EDGE_TYPE_INDEX_DROP; + case StorageMetadataOperation::TEXT_INDEX_CREATE: + return Marker::DELTA_TEXT_INDEX_CREATE; + case StorageMetadataOperation::TEXT_INDEX_DROP: + return Marker::DELTA_TEXT_INDEX_DROP; case StorageMetadataOperation::EXISTENCE_CONSTRAINT_CREATE: return Marker::DELTA_EXISTENCE_CONSTRAINT_CREATE; case StorageMetadataOperation::EXISTENCE_CONSTRAINT_DROP: @@ -172,6 +176,10 @@ WalDeltaData::Type MarkerToWalDeltaDataType(Marker marker) { return WalDeltaData::Type::LABEL_PROPERTY_INDEX_CREATE; case Marker::DELTA_LABEL_PROPERTY_INDEX_DROP: return WalDeltaData::Type::LABEL_PROPERTY_INDEX_DROP; + case Marker::DELTA_TEXT_INDEX_CREATE: + return WalDeltaData::Type::TEXT_INDEX_CREATE; + case Marker::DELTA_TEXT_INDEX_DROP: + return WalDeltaData::Type::TEXT_INDEX_DROP; case Marker::DELTA_LABEL_PROPERTY_INDEX_STATS_SET: return WalDeltaData::Type::LABEL_PROPERTY_INDEX_STATS_SET; case Marker::DELTA_LABEL_PROPERTY_INDEX_STATS_CLEAR: @@ -382,6 +390,21 @@ WalDeltaData ReadSkipWalDeltaData(BaseDecoder *decoder) { if (!decoder->SkipString()) throw 
RecoveryFailure("Invalid WAL data!"); } } + break; + } + case WalDeltaData::Type::TEXT_INDEX_CREATE: + case WalDeltaData::Type::TEXT_INDEX_DROP: { + if constexpr (read_data) { + auto index_name = decoder->ReadString(); + if (!index_name) throw RecoveryFailure("Invalid WAL data!"); + delta.operation_text.index_name = std::move(*index_name); + auto label = decoder->ReadString(); + if (!label) throw RecoveryFailure("Invalid WAL data!"); + delta.operation_text.label = std::move(*label); + } else { + if (!decoder->SkipString() || !decoder->SkipString()) throw RecoveryFailure("Invalid WAL data!"); + } + break; } } @@ -529,6 +552,12 @@ bool operator==(const WalDeltaData &a, const WalDeltaData &b) { case WalDeltaData::Type::LABEL_PROPERTY_INDEX_CREATE: case WalDeltaData::Type::LABEL_PROPERTY_INDEX_DROP: + case WalDeltaData::Type::TEXT_INDEX_CREATE: + return a.operation_text.index_name == b.operation_text.index_name && + a.operation_text.label == b.operation_text.label; + case WalDeltaData::Type::TEXT_INDEX_DROP: + return a.operation_text.index_name == b.operation_text.index_name && + a.operation_text.label == b.operation_text.label; case WalDeltaData::Type::EXISTENCE_CONSTRAINT_CREATE: case WalDeltaData::Type::EXISTENCE_CONSTRAINT_DROP: return a.operation_label_property.label == b.operation_label_property.label && @@ -675,7 +704,8 @@ void EncodeTransactionEnd(BaseEncoder *encoder, uint64_t timestamp) { } void EncodeOperation(BaseEncoder *encoder, NameIdMapper *name_id_mapper, StorageMetadataOperation operation, - LabelId label, const std::set &properties, const LabelIndexStats &stats, + const std::optional text_index_name, LabelId label, + const std::set &properties, const LabelIndexStats &stats, const LabelPropertyIndexStats &property_stats, uint64_t timestamp) { encoder->WriteMarker(Marker::SECTION_DELTA); encoder->WriteUint(timestamp); @@ -731,6 +761,14 @@ void EncodeOperation(BaseEncoder *encoder, NameIdMapper *name_id_mapper, Storage case 
StorageMetadataOperation::EDGE_TYPE_INDEX_DROP: { MG_ASSERT(false, "Invalid function call!"); } + case StorageMetadataOperation::TEXT_INDEX_CREATE: + case StorageMetadataOperation::TEXT_INDEX_DROP: { + MG_ASSERT(text_index_name.has_value(), "Text indices must be named!"); + encoder->WriteMarker(OperationToMarker(operation)); + encoder->WriteString(text_index_name.value()); + encoder->WriteString(name_id_mapper->IdToName(label.AsUint())); + break; + } } } @@ -752,6 +790,8 @@ void EncodeOperation(BaseEncoder *encoder, NameIdMapper *name_id_mapper, Storage case StorageMetadataOperation::LABEL_INDEX_STATS_SET: case StorageMetadataOperation::LABEL_PROPERTY_INDEX_CREATE: case StorageMetadataOperation::LABEL_PROPERTY_INDEX_DROP: + case StorageMetadataOperation::TEXT_INDEX_CREATE: + case StorageMetadataOperation::TEXT_INDEX_DROP: case StorageMetadataOperation::EXISTENCE_CONSTRAINT_CREATE: case StorageMetadataOperation::EXISTENCE_CONSTRAINT_DROP: case StorageMetadataOperation::LABEL_PROPERTY_INDEX_STATS_SET: @@ -1000,6 +1040,20 @@ RecoveryInfo LoadWal(const std::filesystem::path &path, RecoveredIndicesAndConst "The label index stats doesn't exist!"); break; } + case WalDeltaData::Type::TEXT_INDEX_CREATE: { + auto index_name = delta.operation_text.index_name; + auto label = LabelId::FromUint(name_id_mapper->NameToId(delta.operation_text.label)); + AddRecoveredIndexConstraint(&indices_constraints->indices.text_indices, {index_name, label}, + "The text index already exists!"); + break; + } + case WalDeltaData::Type::TEXT_INDEX_DROP: { + auto index_name = delta.operation_text.index_name; + auto label = LabelId::FromUint(name_id_mapper->NameToId(delta.operation_text.label)); + RemoveRecoveredIndexConstraint(&indices_constraints->indices.text_indices, {index_name, label}, + "The text index doesn't exist!"); + break; + } case WalDeltaData::Type::EXISTENCE_CONSTRAINT_CREATE: { auto label_id = LabelId::FromUint(name_id_mapper->NameToId(delta.operation_label_property.label)); auto 
property_id = PropertyId::FromUint(name_id_mapper->NameToId(delta.operation_label_property.property)); @@ -1148,10 +1202,11 @@ void WalFile::AppendTransactionEnd(uint64_t timestamp) { UpdateStats(timestamp); } -void WalFile::AppendOperation(StorageMetadataOperation operation, LabelId label, const std::set &properties, - const LabelIndexStats &stats, const LabelPropertyIndexStats &property_stats, - uint64_t timestamp) { - EncodeOperation(&wal_, name_id_mapper_, operation, label, properties, stats, property_stats, timestamp); +void WalFile::AppendOperation(StorageMetadataOperation operation, const std::optional text_index_name, + LabelId label, const std::set &properties, const LabelIndexStats &stats, + const LabelPropertyIndexStats &property_stats, uint64_t timestamp) { + EncodeOperation(&wal_, name_id_mapper_, operation, text_index_name, label, properties, stats, property_stats, + timestamp); UpdateStats(timestamp); } diff --git a/src/storage/v2/durability/wal.hpp b/src/storage/v2/durability/wal.hpp index 516487e0d..4990e6979 100644 --- a/src/storage/v2/durability/wal.hpp +++ b/src/storage/v2/durability/wal.hpp @@ -69,6 +69,8 @@ struct WalDeltaData { LABEL_PROPERTY_INDEX_STATS_CLEAR, EDGE_INDEX_CREATE, EDGE_INDEX_DROP, + TEXT_INDEX_CREATE, + TEXT_INDEX_DROP, EXISTENCE_CONSTRAINT_CREATE, EXISTENCE_CONSTRAINT_DROP, UNIQUE_CONSTRAINT_CREATE, @@ -127,6 +129,11 @@ struct WalDeltaData { std::string property; std::string stats; } operation_label_property_stats; + + struct { + std::string index_name; + std::string label; + } operation_text; }; bool operator==(const WalDeltaData &a, const WalDeltaData &b); @@ -163,6 +170,8 @@ constexpr bool IsWalDeltaDataTypeTransactionEndVersion15(const WalDeltaData::Typ case WalDeltaData::Type::LABEL_PROPERTY_INDEX_STATS_CLEAR: case WalDeltaData::Type::EDGE_INDEX_CREATE: case WalDeltaData::Type::EDGE_INDEX_DROP: + case WalDeltaData::Type::TEXT_INDEX_CREATE: + case WalDeltaData::Type::TEXT_INDEX_DROP: case 
WalDeltaData::Type::EXISTENCE_CONSTRAINT_CREATE: case WalDeltaData::Type::EXISTENCE_CONSTRAINT_DROP: case WalDeltaData::Type::UNIQUE_CONSTRAINT_CREATE: @@ -213,7 +222,8 @@ void EncodeTransactionEnd(BaseEncoder *encoder, uint64_t timestamp); /// Function used to encode non-transactional operation. void EncodeOperation(BaseEncoder *encoder, NameIdMapper *name_id_mapper, StorageMetadataOperation operation, - LabelId label, const std::set &properties, const LabelIndexStats &stats, + const std::optional text_index_name, LabelId label, + const std::set &properties, const LabelIndexStats &stats, const LabelPropertyIndexStats &property_stats, uint64_t timestamp); void EncodeOperation(BaseEncoder *encoder, NameIdMapper *name_id_mapper, StorageMetadataOperation operation, @@ -248,8 +258,9 @@ class WalFile { void AppendTransactionEnd(uint64_t timestamp); - void AppendOperation(StorageMetadataOperation operation, LabelId label, const std::set &properties, - const LabelIndexStats &stats, const LabelPropertyIndexStats &property_stats, uint64_t timestamp); + void AppendOperation(StorageMetadataOperation operation, const std::optional text_index_name, + LabelId label, const std::set &properties, const LabelIndexStats &stats, + const LabelPropertyIndexStats &property_stats, uint64_t timestamp); void AppendOperation(StorageMetadataOperation operation, EdgeTypeId edge_type, uint64_t timestamp); diff --git a/src/storage/v2/indices/indices.cpp b/src/storage/v2/indices/indices.cpp index 6068f888f..1cbde2362 100644 --- a/src/storage/v2/indices/indices.cpp +++ b/src/storage/v2/indices/indices.cpp @@ -16,6 +16,7 @@ #include "storage/v2/inmemory/edge_type_index.hpp" #include "storage/v2/inmemory/label_index.hpp" #include "storage/v2/inmemory/label_property_index.hpp" +#include "storage/v2/storage.hpp" namespace memgraph::storage { diff --git a/src/storage/v2/indices/indices.hpp b/src/storage/v2/indices/indices.hpp index 40cff577f..6f1bc44db 100644 --- a/src/storage/v2/indices/indices.hpp 
+++ b/src/storage/v2/indices/indices.hpp @@ -18,6 +18,7 @@ #include "storage/v2/indices/edge_type_index.hpp" #include "storage/v2/indices/label_index.hpp" #include "storage/v2/indices/label_property_index.hpp" +#include "storage/v2/indices/text_index.hpp" #include "storage/v2/storage_mode.hpp" namespace memgraph::storage { @@ -31,12 +32,12 @@ struct Indices { Indices &operator=(Indices &&) = delete; ~Indices() = default; - /// This function should be called from garbage collection to clean-up the + /// This function should be called from garbage collection to clean up the /// index. /// TODO: unused in disk indices void RemoveObsoleteEntries(uint64_t oldest_active_start_timestamp, std::stop_token token) const; - /// Surgical removal of entries that was inserted this transaction + /// Surgical removal of entries that were inserted in this transaction /// TODO: unused in disk indices void AbortEntries(LabelId labelId, std::span vertices, uint64_t exact_start_timestamp) const; void AbortEntries(PropertyId property, std::span const> vertices, @@ -71,6 +72,7 @@ struct Indices { std::unique_ptr label_index_; std::unique_ptr label_property_index_; std::unique_ptr edge_type_index_; + mutable TextIndex text_index_; }; } // namespace memgraph::storage diff --git a/src/storage/v2/indices/text_index.cpp b/src/storage/v2/indices/text_index.cpp new file mode 100644 index 000000000..1c9488097 --- /dev/null +++ b/src/storage/v2/indices/text_index.cpp @@ -0,0 +1,430 @@ +// Copyright 2024 Memgraph Ltd. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source +// License, and you may not use this file except in compliance with the Business Source License. 
+// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +#include "storage/v2/indices/text_index.hpp" +#include "flags/experimental.hpp" +#include "flags/run_time_configurable.hpp" +#include "query/db_accessor.hpp" +#include "storage/v2/view.hpp" +#include "text_search.hpp" + +namespace memgraph::storage { + +std::string GetPropertyName(PropertyId prop_id, memgraph::query::DbAccessor *db) { return db->PropertyToName(prop_id); } + +std::string GetPropertyName(PropertyId prop_id, NameIdMapper *name_id_mapper) { + return name_id_mapper->IdToName(prop_id.AsUint()); +} + +inline std::string TextIndex::MakeIndexPath(const std::filesystem::path &storage_dir, const std::string &index_name) { + return (storage_dir / kTextIndicesDirectory / index_name).string(); +} + +void TextIndex::CreateEmptyIndex(const std::filesystem::path &storage_dir, const std::string &index_name, + LabelId label) { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw query::TextSearchDisabledException(); + } + + if (index_.contains(index_name)) { + throw query::TextSearchException("Text index \"{}\" already exists.", index_name); + } + + try { + nlohmann::json mappings = {}; + mappings["properties"] = {}; + mappings["properties"]["metadata"] = {{"type", "json"}, {"fast", true}, {"stored", true}, {"text", true}}; + mappings["properties"]["data"] = {{"type", "json"}, {"fast", true}, {"stored", true}, {"text", true}}; + mappings["properties"]["all"] = {{"type", "text"}, {"fast", true}, {"stored", true}, {"text", true}}; + + index_.emplace(index_name, TextIndexData{.context_ = mgcxx::text_search::create_index( + MakeIndexPath(storage_dir, index_name), + mgcxx::text_search::IndexConfig{.mappings = mappings.dump()}), + .scope_ = label}); + } catch (const std::exception &e) { + throw 
query::TextSearchException("Tantivy error: {}", e.what()); + } + label_to_index_.emplace(label, index_name); +} + +template +nlohmann::json TextIndex::SerializeProperties(const std::map &properties, T *name_resolver) { + nlohmann::json serialized_properties = nlohmann::json::value_t::object; + for (const auto &[prop_id, prop_value] : properties) { + switch (prop_value.type()) { + case PropertyValue::Type::Bool: + serialized_properties[GetPropertyName(prop_id, name_resolver)] = prop_value.ValueBool(); + break; + case PropertyValue::Type::Int: + serialized_properties[GetPropertyName(prop_id, name_resolver)] = prop_value.ValueInt(); + break; + case PropertyValue::Type::Double: + serialized_properties[GetPropertyName(prop_id, name_resolver)] = prop_value.ValueDouble(); + break; + case PropertyValue::Type::String: + serialized_properties[GetPropertyName(prop_id, name_resolver)] = prop_value.ValueString(); + break; + case PropertyValue::Type::Null: + case PropertyValue::Type::List: + case PropertyValue::Type::Map: + case PropertyValue::Type::TemporalData: + default: + continue; + } + } + + return serialized_properties; +} + +std::string TextIndex::StringifyProperties(const std::map &properties) { + std::vector indexable_properties_as_string; + for (const auto &[_, prop_value] : properties) { + switch (prop_value.type()) { + case PropertyValue::Type::Bool: + indexable_properties_as_string.push_back(prop_value.ValueBool() ? "true" : "false"); + break; + case PropertyValue::Type::Int: + indexable_properties_as_string.push_back(std::to_string(prop_value.ValueInt())); + break; + case PropertyValue::Type::Double: + indexable_properties_as_string.push_back(std::to_string(prop_value.ValueDouble())); + break; + case PropertyValue::Type::String: + indexable_properties_as_string.push_back(prop_value.ValueString()); + break; + // NOTE: As the following types aren‘t indexed in Tantivy, they don’t appear in the property value string either. 
+ case PropertyValue::Type::Null: + case PropertyValue::Type::List: + case PropertyValue::Type::Map: + case PropertyValue::Type::TemporalData: + default: + continue; + } + } + return utils::Join(indexable_properties_as_string, " "); +} + +std::vector TextIndex::GetApplicableTextIndices(const std::vector &labels) { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw query::TextSearchDisabledException(); + } + + std::vector applicable_text_indices; + for (const auto &label : labels) { + if (label_to_index_.contains(label)) { + applicable_text_indices.push_back(&index_.at(label_to_index_.at(label)).context_); + } + } + return applicable_text_indices; +} + +void TextIndex::LoadNodeToTextIndices(const std::int64_t gid, const nlohmann::json &properties, + const std::string &property_values_as_str, + const std::vector &applicable_text_indices) { + if (applicable_text_indices.empty()) { + return; + } + + // NOTE: Text indexes are presently all-property indices. If we allow text indexes restricted to specific properties, + // an indexable document should be created for each applicable index. + nlohmann::json document = {}; + document["data"] = properties; + document["all"] = property_values_as_str; + document["metadata"] = {}; + document["metadata"]["gid"] = gid; + document["metadata"]["deleted"] = false; + document["metadata"]["is_node"] = true; + + for (auto *index_context : applicable_text_indices) { + try { + mgcxx::text_search::add_document( + *index_context, + mgcxx::text_search::DocumentInput{ + .data = document.dump(-1, ' ', false, nlohmann::json::error_handler_t::replace)}, + kDoSkipCommit); + } catch (const std::exception &e) { + throw query::TextSearchException("Tantivy error: {}", e.what()); + } + } +} + +void TextIndex::CommitLoadedNodes(mgcxx::text_search::Context &index_context) { + // As CREATE TEXT INDEX (...) 
queries don’t accumulate deltas, db_transactional_accessor_->Commit() does not reach + // the code area where changes to indices are committed. To get around that without needing to commit text indices + // after every such query, we commit here. + try { + mgcxx::text_search::commit(index_context); + } catch (const std::exception &e) { + throw query::TextSearchException("Tantivy error: {}", e.what()); + } +} + +void TextIndex::AddNode( + Vertex *vertex_after_update, NameIdMapper *name_id_mapper, + const std::optional> &maybe_applicable_text_indices) { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw query::TextSearchDisabledException(); + } + + auto applicable_text_indices = + maybe_applicable_text_indices.value_or(GetApplicableTextIndices(vertex_after_update->labels)); + if (applicable_text_indices.empty()) { + return; + } + + auto vertex_properties = vertex_after_update->properties.Properties(); + LoadNodeToTextIndices(vertex_after_update->gid.AsInt(), SerializeProperties(vertex_properties, name_id_mapper), + StringifyProperties(vertex_properties), applicable_text_indices); +} + +void TextIndex::UpdateNode(Vertex *vertex_after_update, NameIdMapper *name_id_mapper, + const std::vector &removed_labels) { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw query::TextSearchDisabledException(); + } + + if (!removed_labels.empty()) { + auto indexes_to_remove_node_from = GetApplicableTextIndices(removed_labels); + RemoveNode(vertex_after_update, indexes_to_remove_node_from); + } + + auto applicable_text_indices = GetApplicableTextIndices(vertex_after_update->labels); + if (applicable_text_indices.empty()) return; + RemoveNode(vertex_after_update, applicable_text_indices); + AddNode(vertex_after_update, name_id_mapper, applicable_text_indices); +} + +void TextIndex::RemoveNode( + Vertex *vertex_after_update, + const std::optional> &maybe_applicable_text_indices) { + if 
(!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw query::TextSearchDisabledException(); + } + + auto search_node_to_be_deleted = + mgcxx::text_search::SearchInput{.search_query = fmt::format("metadata.gid:{}", vertex_after_update->gid.AsInt())}; + + for (auto *index_context : + maybe_applicable_text_indices.value_or(GetApplicableTextIndices(vertex_after_update->labels))) { + try { + mgcxx::text_search::delete_document(*index_context, search_node_to_be_deleted, kDoSkipCommit); + } catch (const std::exception &e) { + throw query::TextSearchException("Tantivy error: {}", e.what()); + } + } +} + +void TextIndex::CreateIndex(const std::filesystem::path &storage_dir, const std::string &index_name, LabelId label, + memgraph::query::DbAccessor *db) { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw query::TextSearchDisabledException(); + } + + CreateEmptyIndex(storage_dir, index_name, label); + + for (const auto &v : db->Vertices(View::NEW)) { + if (!v.HasLabel(View::NEW, label).GetValue()) { + continue; + } + + auto vertex_properties = v.Properties(View::NEW).GetValue(); + LoadNodeToTextIndices(v.Gid().AsInt(), SerializeProperties(vertex_properties, db), + StringifyProperties(vertex_properties), {&index_.at(index_name).context_}); + } + + CommitLoadedNodes(index_.at(index_name).context_); +} + +void TextIndex::RecoverIndex(const std::filesystem::path &storage_dir, const std::string &index_name, LabelId label, + memgraph::utils::SkipList::Accessor vertices, NameIdMapper *name_id_mapper) { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw query::TextSearchDisabledException(); + } + + // Clear Tantivy-internal files if they exist from previous sessions + std::filesystem::remove_all(storage_dir / kTextIndicesDirectory / index_name); + + CreateEmptyIndex(storage_dir, index_name, label); + + for (const auto &v : vertices) { + if (std::find(v.labels.begin(), v.labels.end(), label) == 
v.labels.end()) { + continue; + } + + auto vertex_properties = v.properties.Properties(); + LoadNodeToTextIndices(v.gid.AsInt(), SerializeProperties(vertex_properties, name_id_mapper), + StringifyProperties(vertex_properties), {&index_.at(index_name).context_}); + } + + CommitLoadedNodes(index_.at(index_name).context_); +} + +LabelId TextIndex::DropIndex(const std::filesystem::path &storage_dir, const std::string &index_name) { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw query::TextSearchDisabledException(); + } + + if (!index_.contains(index_name)) { + throw query::TextSearchException("Text index \"{}\" doesn’t exist.", index_name); + } + + try { + mgcxx::text_search::drop_index(MakeIndexPath(storage_dir, index_name)); + } catch (const std::exception &e) { + throw query::TextSearchException("Tantivy error: {}", e.what()); + } + auto deleted_index_label = index_.at(index_name).scope_; + + index_.erase(index_name); + std::erase_if(label_to_index_, [index_name](const auto &item) { return item.second == index_name; }); + + return deleted_index_label; +} + +bool TextIndex::IndexExists(const std::string &index_name) const { return index_.contains(index_name); } + +mgcxx::text_search::SearchOutput TextIndex::SearchGivenProperties(const std::string &index_name, + const std::string &search_query) { + try { + return mgcxx::text_search::search( + index_.at(index_name).context_, + mgcxx::text_search::SearchInput{.search_query = search_query, .return_fields = {"metadata"}}); + } catch (const std::exception &e) { + throw query::TextSearchException("Tantivy error: {}", e.what()); + } + + return mgcxx::text_search::SearchOutput{}; +} + +mgcxx::text_search::SearchOutput TextIndex::RegexSearch(const std::string &index_name, + const std::string &search_query) { + try { + return mgcxx::text_search::regex_search( + index_.at(index_name).context_, + mgcxx::text_search::SearchInput{ + .search_fields = {"all"}, .search_query = search_query, 
.return_fields = {"metadata"}}); + } catch (const std::exception &e) { + throw query::TextSearchException("Tantivy error: {}", e.what()); + } + + return mgcxx::text_search::SearchOutput{}; +} + +mgcxx::text_search::SearchOutput TextIndex::SearchAllProperties(const std::string &index_name, + const std::string &search_query) { + try { + return mgcxx::text_search::search( + index_.at(index_name).context_, + mgcxx::text_search::SearchInput{ + .search_fields = {"all"}, .search_query = search_query, .return_fields = {"metadata"}}); + } catch (const std::exception &e) { + throw query::TextSearchException("Tantivy error: {}", e.what()); + } + + return mgcxx::text_search::SearchOutput{}; +} + +std::vector TextIndex::Search(const std::string &index_name, const std::string &search_query, + text_search_mode search_mode) { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw query::TextSearchDisabledException(); + } + + if (!index_.contains(index_name)) { + throw query::TextSearchException("Text index \"{}\" doesn’t exist.", index_name); + } + + mgcxx::text_search::SearchOutput search_results; + switch (search_mode) { + case text_search_mode::SPECIFIED_PROPERTIES: + search_results = SearchGivenProperties(index_name, search_query); + break; + case text_search_mode::REGEX: + search_results = RegexSearch(index_name, search_query); + break; + case text_search_mode::ALL_PROPERTIES: + search_results = SearchAllProperties(index_name, search_query); + break; + default: + throw query::TextSearchException( + "Unsupported search mode: please use one of text_search.search, text_search.search_all, or " + "text_search.regex_search."); + } + + std::vector found_nodes; + for (const auto &doc : search_results.docs) { + // The CXX .data() method (https://cxx.rs/binding/string.html) returns a pointer that is not null-terminated, so + // initializing a std::string from the bare pointer scans past the end of the Rust-owned buffer (out-of-bounds read).
+ // Copy exactly .length() bytes with the (pointer, length) constructor instead. + std::string doc_string(doc.data.data(), doc.data.length()); + auto doc_json = nlohmann::json::parse(doc_string); + found_nodes.push_back(storage::Gid::FromString(doc_json["metadata"]["gid"].dump())); + } + return found_nodes; +} + +std::string TextIndex::Aggregate(const std::string &index_name, const std::string &search_query, + const std::string &aggregation_query) { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw query::TextSearchDisabledException(); + } + + if (!index_.contains(index_name)) { + throw query::TextSearchException("Text index \"{}\" doesn’t exist.", index_name); + } + + mgcxx::text_search::DocumentOutput aggregation_result; + try { + aggregation_result = mgcxx::text_search::aggregate( + index_.at(index_name).context_, + mgcxx::text_search::SearchInput{ + .search_fields = {"all"}, .search_query = search_query, .aggregation_query = aggregation_query}); + + } catch (const std::exception &e) { + throw query::TextSearchException("Tantivy error: {}", e.what()); + } + // The CXX .data() method (https://cxx.rs/binding/string.html) returns a pointer that is not null-terminated, so + // initializing a std::string from the bare pointer scans past the end of the Rust-owned buffer (out-of-bounds read).
+ // Copy exactly .length() bytes with the (pointer, length) constructor instead. + std::string result_string(aggregation_result.data.data(), aggregation_result.data.length()); + return result_string; +} + +void TextIndex::Commit() { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw query::TextSearchDisabledException(); + } + + for (auto &[_, index_data] : index_) { + mgcxx::text_search::commit(index_data.context_); + } +} + +void TextIndex::Rollback() { + if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + throw query::TextSearchDisabledException(); + } + + for (auto &[_, index_data] : index_) { + mgcxx::text_search::rollback(index_data.context_); + } +} + +std::vector> TextIndex::ListIndices() const { + std::vector> ret; + ret.reserve(index_.size()); + for (const auto &[index_name, index_data] : index_) { + ret.emplace_back(index_name, index_data.scope_); + } + return ret; +} + +} // namespace memgraph::storage diff --git a/src/storage/v2/indices/text_index.hpp b/src/storage/v2/indices/text_index.hpp new file mode 100644 index 000000000..af4748c6e --- /dev/null +++ b/src/storage/v2/indices/text_index.hpp @@ -0,0 +1,105 @@ +// Copyright 2024 Memgraph Ltd. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source +// License, and you may not use this file except in compliance with the Business Source License. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt.
+ +#pragma once + +#include +#include "mg_procedure.h" +#include "storage/v2/id_types.hpp" +#include "storage/v2/name_id_mapper.hpp" +#include "storage/v2/vertex.hpp" +#include "text_search.hpp" + +namespace memgraph::query { +class DbAccessor; +} + +namespace memgraph::storage { +struct TextIndexData { + mgcxx::text_search::Context context_; + LabelId scope_; +}; + +class TextIndex { + private: + static constexpr bool kDoSkipCommit = true; + static constexpr std::string_view kTextIndicesDirectory = "text_indices"; + + inline std::string MakeIndexPath(const std::filesystem::path &storage_dir, const std::string &index_name); + + void CreateEmptyIndex(const std::filesystem::path &storage_dir, const std::string &index_name, LabelId label); + + template + nlohmann::json SerializeProperties(const std::map &properties, T *name_resolver); + + std::string StringifyProperties(const std::map &properties); + + std::vector GetApplicableTextIndices(const std::vector &labels); + + void LoadNodeToTextIndices(const std::int64_t gid, const nlohmann::json &properties, + const std::string &property_values_as_str, + const std::vector &applicable_text_indices); + + void CommitLoadedNodes(mgcxx::text_search::Context &index_context); + + mgcxx::text_search::SearchOutput SearchGivenProperties(const std::string &index_name, + const std::string &search_query); + + mgcxx::text_search::SearchOutput RegexSearch(const std::string &index_name, const std::string &search_query); + + mgcxx::text_search::SearchOutput SearchAllProperties(const std::string &index_name, const std::string &search_query); + + public: + TextIndex() = default; + + TextIndex(const TextIndex &) = delete; + TextIndex(TextIndex &&) = delete; + TextIndex &operator=(const TextIndex &) = delete; + TextIndex &operator=(TextIndex &&) = delete; + + ~TextIndex() = default; + + std::map index_; + std::map label_to_index_; + + void AddNode( + Vertex *vertex, NameIdMapper *name_id_mapper, + const std::optional> 
&maybe_applicable_text_indices = std::nullopt); + + void UpdateNode(Vertex *vertex, NameIdMapper *name_id_mapper, const std::vector &removed_labels = {}); + + void RemoveNode( + Vertex *vertex, + const std::optional> &maybe_applicable_text_indices = std::nullopt); + + void CreateIndex(const std::filesystem::path &storage_dir, const std::string &index_name, LabelId label, + memgraph::query::DbAccessor *db); + + void RecoverIndex(const std::filesystem::path &storage_dir, const std::string &index_name, LabelId label, + memgraph::utils::SkipList::Accessor vertices, NameIdMapper *name_id_mapper); + + LabelId DropIndex(const std::filesystem::path &storage_dir, const std::string &index_name); + + bool IndexExists(const std::string &index_name) const; + + std::vector Search(const std::string &index_name, const std::string &search_query, text_search_mode search_mode); + + std::string Aggregate(const std::string &index_name, const std::string &search_query, + const std::string &aggregation_query); + + void Commit(); + + void Rollback(); + + std::vector> ListIndices() const; +}; + +} // namespace memgraph::storage diff --git a/src/storage/v2/inmemory/storage.cpp b/src/storage/v2/inmemory/storage.cpp index dab56750b..dbaa56ce2 100644 --- a/src/storage/v2/inmemory/storage.cpp +++ b/src/storage/v2/inmemory/storage.cpp @@ -15,6 +15,8 @@ #include #include #include "dbms/constants.hpp" +#include "flags/experimental.hpp" +#include "flags/run_time_configurable.hpp" #include "memory/global_memory_control.hpp" #include "storage/v2/durability/durability.hpp" #include "storage/v2/durability/snapshot.hpp" @@ -890,6 +892,10 @@ utils::BasicResult InMemoryStorage::InMemoryAcce commit_timestamp_.reset(); // We have aborted, hence we have not committed return StorageManipulationError{*unique_constraint_violation}; } + + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + mem_storage->indices_.text_index_.Commit(); + } } is_transaction_active_ = false; @@ -1213,6 +1219,9 @@ 
void InMemoryStorage::InMemoryAccessor::Abort() { for (auto const &[property, prop_vertices] : property_cleanup) { storage_->indices_.AbortEntries(property, prop_vertices, transaction_.start_timestamp); } + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + storage_->indices_.text_index_.Rollback(); + } // VERTICES { @@ -1846,6 +1855,7 @@ StorageInfo InMemoryStorage::GetInfo(memgraph::replication_coordination_glue::Re const auto &lbl = access->ListAllIndices(); info.label_indices = lbl.label.size(); info.label_property_indices = lbl.label_property.size(); + info.text_indices = lbl.text_indices.size(); const auto &con = access->ListAllConstraints(); info.existence_constraints = con.existence.size(); info.unique_constraints = con.unique.size(); @@ -2107,6 +2117,16 @@ bool InMemoryStorage::AppendToWal(const Transaction &transaction, uint64_t final AppendToWalDataDefinition(durability::StorageMetadataOperation::LABEL_PROPERTY_INDEX_STATS_CLEAR, info.label, final_commit_timestamp); } break; + case MetadataDelta::Action::TEXT_INDEX_CREATE: { + const auto &info = md_delta.text_index; + AppendToWalDataDefinition(durability::StorageMetadataOperation::TEXT_INDEX_CREATE, info.index_name, info.label, + final_commit_timestamp); + } break; + case MetadataDelta::Action::TEXT_INDEX_DROP: { + const auto &info = md_delta.text_index; + AppendToWalDataDefinition(durability::StorageMetadataOperation::TEXT_INDEX_DROP, info.index_name, info.label, + final_commit_timestamp); + } break; case MetadataDelta::Action::EXISTENCE_CONSTRAINT_CREATE: { const auto &info = md_delta.label_property; AppendToWalDataDefinition(durability::StorageMetadataOperation::EXISTENCE_CONSTRAINT_CREATE, info.label, @@ -2137,11 +2157,13 @@ bool InMemoryStorage::AppendToWal(const Transaction &transaction, uint64_t final return repl_storage_state_.FinalizeTransaction(final_commit_timestamp, this, std::move(db_acc)); } -void 
InMemoryStorage::AppendToWalDataDefinition(durability::StorageMetadataOperation operation, LabelId label, +void InMemoryStorage::AppendToWalDataDefinition(durability::StorageMetadataOperation operation, + const std::optional text_index_name, LabelId label, const std::set &properties, LabelIndexStats stats, LabelPropertyIndexStats property_stats, uint64_t final_commit_timestamp) { - wal_file_->AppendOperation(operation, label, properties, stats, property_stats, final_commit_timestamp); + wal_file_->AppendOperation(operation, text_index_name, label, properties, stats, property_stats, + final_commit_timestamp); repl_storage_state_.AppendOperation(operation, label, properties, stats, property_stats, final_commit_timestamp); } @@ -2155,12 +2177,13 @@ void InMemoryStorage::AppendToWalDataDefinition(durability::StorageMetadataOpera const std::set &properties, LabelPropertyIndexStats property_stats, uint64_t final_commit_timestamp) { - return AppendToWalDataDefinition(operation, label, properties, {}, property_stats, final_commit_timestamp); + return AppendToWalDataDefinition(operation, std::nullopt, label, properties, {}, property_stats, + final_commit_timestamp); } void InMemoryStorage::AppendToWalDataDefinition(durability::StorageMetadataOperation operation, LabelId label, LabelIndexStats stats, uint64_t final_commit_timestamp) { - return AppendToWalDataDefinition(operation, label, {}, stats, {}, final_commit_timestamp); + return AppendToWalDataDefinition(operation, std::nullopt, label, {}, stats, {}, final_commit_timestamp); } void InMemoryStorage::AppendToWalDataDefinition(durability::StorageMetadataOperation operation, LabelId label, @@ -2174,6 +2197,12 @@ void InMemoryStorage::AppendToWalDataDefinition(durability::StorageMetadataOpera return AppendToWalDataDefinition(operation, label, {}, {}, final_commit_timestamp); } +void InMemoryStorage::AppendToWalDataDefinition(durability::StorageMetadataOperation operation, + const std::optional text_index_name, LabelId 
label, + uint64_t final_commit_timestamp) { + return AppendToWalDataDefinition(operation, text_index_name, label, {}, {}, {}, final_commit_timestamp); +} + utils::BasicResult InMemoryStorage::CreateSnapshot( memgraph::replication_coordination_glue::ReplicationRole replication_role) { using memgraph::replication_coordination_glue::ReplicationRole; @@ -2301,7 +2330,9 @@ IndicesInfo InMemoryStorage::InMemoryAccessor::ListAllIndices() const { auto *mem_label_property_index = static_cast(in_memory->indices_.label_property_index_.get()); auto *mem_edge_type_index = static_cast(in_memory->indices_.edge_type_index_.get()); - return {mem_label_index->ListIndices(), mem_label_property_index->ListIndices(), mem_edge_type_index->ListIndices()}; + auto &text_index = storage_->indices_.text_index_; + return {mem_label_index->ListIndices(), mem_label_property_index->ListIndices(), mem_edge_type_index->ListIndices(), + text_index.ListIndices()}; } ConstraintsInfo InMemoryStorage::InMemoryAccessor::ListAllConstraints() const { const auto *mem_storage = static_cast(storage_); diff --git a/src/storage/v2/inmemory/storage.hpp b/src/storage/v2/inmemory/storage.hpp index 6d10e0fbd..6532471f3 100644 --- a/src/storage/v2/inmemory/storage.hpp +++ b/src/storage/v2/inmemory/storage.hpp @@ -398,7 +398,7 @@ class InMemoryStorage final : public Storage { StorageInfo GetBaseInfo() override; StorageInfo GetInfo(memgraph::replication_coordination_glue::ReplicationRole replication_role) override; - /// Return true in all cases excepted if any sync replicas have not sent confirmation. + /// Return true in all cases except if any sync replicas have not sent confirmation. 
[[nodiscard]] bool AppendToWal(const Transaction &transaction, uint64_t final_commit_timestamp, DatabaseAccessProtector db_acc); void AppendToWalDataDefinition(durability::StorageMetadataOperation operation, LabelId label, @@ -412,9 +412,13 @@ class InMemoryStorage final : public Storage { void AppendToWalDataDefinition(durability::StorageMetadataOperation operation, LabelId label, const std::set &properties, LabelPropertyIndexStats property_stats, uint64_t final_commit_timestamp); - void AppendToWalDataDefinition(durability::StorageMetadataOperation operation, LabelId label, + void AppendToWalDataDefinition(durability::StorageMetadataOperation operation, + const std::optional text_index_name, LabelId label, const std::set &properties, LabelIndexStats stats, LabelPropertyIndexStats property_stats, uint64_t final_commit_timestamp); + void AppendToWalDataDefinition(durability::StorageMetadataOperation operation, + const std::optional text_index_name, LabelId label, + uint64_t final_commit_timestamp); uint64_t CommitTimestamp(std::optional desired_commit_timestamp = {}); diff --git a/src/storage/v2/metadata_delta.hpp b/src/storage/v2/metadata_delta.hpp index b34966a62..e4616161d 100644 --- a/src/storage/v2/metadata_delta.hpp +++ b/src/storage/v2/metadata_delta.hpp @@ -37,6 +37,8 @@ struct MetadataDelta { LABEL_PROPERTY_INDEX_STATS_CLEAR, EDGE_INDEX_CREATE, EDGE_INDEX_DROP, + TEXT_INDEX_CREATE, + TEXT_INDEX_DROP, EXISTENCE_CONSTRAINT_CREATE, EXISTENCE_CONSTRAINT_DROP, UNIQUE_CONSTRAINT_CREATE, @@ -63,6 +65,10 @@ struct MetadataDelta { } edge_index_create; static constexpr struct EdgeIndexDrop { } edge_index_drop; + static constexpr struct TextIndexCreate { + } text_index_create; + static constexpr struct TextIndexDrop { + } text_index_drop; static constexpr struct ExistenceConstraintCreate { } existence_constraint_create; static constexpr struct ExistenceConstraintDrop { @@ -98,6 +104,12 @@ struct MetadataDelta { MetadataDelta(EdgeIndexDrop /*tag*/, EdgeTypeId 
edge_type) : action(Action::EDGE_INDEX_DROP), edge_type(edge_type) {} + MetadataDelta(TextIndexCreate /*tag*/, std::string index_name, LabelId label) + : action(Action::TEXT_INDEX_CREATE), text_index{index_name, label} {} + + MetadataDelta(TextIndexDrop /*tag*/, std::string index_name, LabelId label) + : action(Action::TEXT_INDEX_DROP), text_index{index_name, label} {} + MetadataDelta(ExistenceConstraintCreate /*tag*/, LabelId label, PropertyId property) : action(Action::EXISTENCE_CONSTRAINT_CREATE), label_property{label, property} {} @@ -127,6 +139,8 @@ struct MetadataDelta { case Action::LABEL_PROPERTY_INDEX_STATS_CLEAR: case Action::EDGE_INDEX_CREATE: case Action::EDGE_INDEX_DROP: + case Action::TEXT_INDEX_CREATE: + case Action::TEXT_INDEX_DROP: case Action::EXISTENCE_CONSTRAINT_CREATE: case Action::EXISTENCE_CONSTRAINT_DROP: break; @@ -164,6 +178,11 @@ struct MetadataDelta { PropertyId property; LabelPropertyIndexStats stats; } label_property_stats; + + struct { + std::string index_name; + LabelId label; + } text_index; }; }; diff --git a/src/storage/v2/property_store.cpp b/src/storage/v2/property_store.cpp index adf3440a2..0cfee0f98 100644 --- a/src/storage/v2/property_store.cpp +++ b/src/storage/v2/property_store.cpp @@ -118,7 +118,7 @@ enum class Type : uint8_t { STRING = 0x50, LIST = 0x60, MAP = 0x70, - TEMPORAL_DATA = 0x80 + TEMPORAL_DATA = 0x80, }; const uint8_t kMaskType = 0xf0; diff --git a/src/storage/v2/replication/replication_client.cpp b/src/storage/v2/replication/replication_client.cpp index a02c1eff0..ee1394fdb 100644 --- a/src/storage/v2/replication/replication_client.cpp +++ b/src/storage/v2/replication/replication_client.cpp @@ -406,8 +406,9 @@ void ReplicaStream::AppendOperation(durability::StorageMetadataOperation operati const std::set &properties, const LabelIndexStats &stats, const LabelPropertyIndexStats &property_stats, uint64_t timestamp) { replication::Encoder encoder(stream_.GetBuilder()); - EncodeOperation(&encoder, 
storage_->name_id_mapper_.get(), operation, label, properties, stats, property_stats, - timestamp); + // NOTE: Text search doesn’t have replication in scope yet (Phases 1 and 2) -> text index name not sent here + EncodeOperation(&encoder, storage_->name_id_mapper_.get(), operation, std::nullopt, label, properties, stats, + property_stats, timestamp); } void ReplicaStream::AppendOperation(durability::StorageMetadataOperation operation, EdgeTypeId edge_type, diff --git a/src/storage/v2/storage.cpp b/src/storage/v2/storage.cpp index 536a504a0..db4bec8be 100644 --- a/src/storage/v2/storage.cpp +++ b/src/storage/v2/storage.cpp @@ -13,6 +13,8 @@ #include "absl/container/flat_hash_set.h" #include "spdlog/spdlog.h" +#include "flags/experimental.hpp" +#include "flags/run_time_configurable.hpp" #include "storage/v2/disk/name_id_mapper.hpp" #include "storage/v2/storage.hpp" #include "storage/v2/transaction.hpp" @@ -273,6 +275,12 @@ Storage::Accessor::DetachDelete(std::vector nodes, std::vector return maybe_deleted_vertices.GetError(); } + if (flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) { + for (auto *node : nodes_to_delete) { + storage_->indices_.text_index_.RemoveNode(node); + } + } + auto deleted_vertices = maybe_deleted_vertices.GetValue(); return std::make_optional(std::move(deleted_vertices), std::move(deleted_edges)); @@ -543,4 +551,19 @@ void Storage::Accessor::MarkEdgeAsDeleted(Edge *edge) { } } +void Storage::Accessor::CreateTextIndex(const std::string &index_name, LabelId label, query::DbAccessor *db) { + MG_ASSERT(unique_guard_.owns_lock(), "Creating a text index requires unique access to storage!"); + storage_->indices_.text_index_.CreateIndex(storage_->config_.durability.storage_directory, index_name, label, db); + transaction_.md_deltas.emplace_back(MetadataDelta::text_index_create, index_name, label); + memgraph::metrics::IncrementCounter(memgraph::metrics::ActiveTextIndices); +} + +void Storage::Accessor::DropTextIndex(const std::string 
&index_name) { + MG_ASSERT(unique_guard_.owns_lock(), "Dropping a text index requires unique access to storage!"); + auto deleted_index_label = + storage_->indices_.text_index_.DropIndex(storage_->config_.durability.storage_directory, index_name); + transaction_.md_deltas.emplace_back(MetadataDelta::text_index_drop, index_name, deleted_index_label); + memgraph::metrics::DecrementCounter(memgraph::metrics::ActiveTextIndices); +} + } // namespace memgraph::storage diff --git a/src/storage/v2/storage.hpp b/src/storage/v2/storage.hpp index 58936bd56..a4436b1b7 100644 --- a/src/storage/v2/storage.hpp +++ b/src/storage/v2/storage.hpp @@ -20,6 +20,7 @@ #include "io/network/endpoint.hpp" #include "kvstore/kvstore.hpp" +#include "mg_procedure.h" #include "query/exceptions.hpp" #include "replication/config.hpp" #include "replication/replication_server.hpp" @@ -53,6 +54,7 @@ extern const Event SnapshotCreationLatency_us; extern const Event ActiveLabelIndices; extern const Event ActiveLabelPropertyIndices; +extern const Event ActiveTextIndices; } // namespace memgraph::metrics namespace memgraph::storage { @@ -63,6 +65,7 @@ struct IndicesInfo { std::vector label; std::vector> label_property; std::vector edge_type; + std::vector> text_indices; }; struct ConstraintsInfo { @@ -78,6 +81,7 @@ struct StorageInfo { uint64_t disk_usage; uint64_t label_indices; uint64_t label_property_indices; + uint64_t text_indices; uint64_t existence_constraints; uint64_t unique_constraints; StorageMode storage_mode; @@ -95,6 +99,7 @@ static inline nlohmann::json ToJson(const StorageInfo &info) { res["disk"] = info.disk_usage; res["label_indices"] = info.label_indices; res["label_prop_indices"] = info.label_property_indices; + res["text_indices"] = info.text_indices; res["existence_constraints"] = info.existence_constraints; res["unique_constraints"] = info.unique_constraints; res["storage_mode"] = storage::StorageModeToString(info.storage_mode); @@ -232,6 +237,28 @@ class Storage { virtual bool 
EdgeTypeIndexExists(EdgeTypeId edge_type) const = 0; + bool TextIndexExists(const std::string &index_name) const { + return storage_->indices_.text_index_.IndexExists(index_name); + } + + void TextIndexAddVertex(const VertexAccessor &vertex) { + storage_->indices_.text_index_.AddNode(vertex.vertex_, storage_->name_id_mapper_.get()); + } + + void TextIndexUpdateVertex(const VertexAccessor &vertex, const std::vector &removed_labels = {}) { + storage_->indices_.text_index_.UpdateNode(vertex.vertex_, storage_->name_id_mapper_.get(), removed_labels); + } + + std::vector TextIndexSearch(const std::string &index_name, const std::string &search_query, + text_search_mode search_mode) const { + return storage_->indices_.text_index_.Search(index_name, search_query, search_mode); + } + + std::string TextIndexAggregate(const std::string &index_name, const std::string &search_query, + const std::string &aggregation_query) const { + return storage_->indices_.text_index_.Aggregate(index_name, search_query, aggregation_query); + } + virtual IndicesInfo ListAllIndices() const = 0; virtual ConstraintsInfo ListAllConstraints() const = 0; @@ -284,6 +311,10 @@ class Storage { virtual utils::BasicResult DropIndex(EdgeTypeId edge_type) = 0; + void CreateTextIndex(const std::string &index_name, LabelId label, query::DbAccessor *db); + + void DropTextIndex(const std::string &index_name); + virtual utils::BasicResult CreateExistenceConstraint( LabelId label, PropertyId property) = 0; diff --git a/src/utils/event_counter.cpp b/src/utils/event_counter.cpp index 54ff4ed5c..7b1579a93 100644 --- a/src/utils/event_counter.cpp +++ b/src/utils/event_counter.cpp @@ -1,4 +1,4 @@ -// Copyright 2023 Memgraph Ltd. +// Copyright 2024 Memgraph Ltd. 
// // Use of this software is governed by the Business Source License // included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source @@ -60,6 +60,7 @@ \ M(ActiveLabelIndices, Index, "Number of active label indices in the system.") \ M(ActiveLabelPropertyIndices, Index, "Number of active label property indices in the system.") \ + M(ActiveTextIndices, Index, "Number of active text indices in the system.") \ \ M(StreamsCreated, Stream, "Number of Streams created.") \ M(MessagesConsumed, Stream, "Number of consumed streamed messages.") \ diff --git a/src/utils/typeinfo.hpp b/src/utils/typeinfo.hpp index 77910f731..aeb62d2c1 100644 --- a/src/utils/typeinfo.hpp +++ b/src/utils/typeinfo.hpp @@ -187,6 +187,7 @@ enum class TypeId : uint64_t { AST_PROFILE_QUERY, AST_INDEX_QUERY, AST_EDGE_INDEX_QUERY, + AST_TEXT_INDEX_QUERY, AST_CREATE, AST_CALL_PROCEDURE, AST_MATCH, diff --git a/tests/e2e/configuration/default_config.py b/tests/e2e/configuration/default_config.py index 75c211e0f..11435da65 100644 --- a/tests/e2e/configuration/default_config.py +++ b/tests/e2e/configuration/default_config.py @@ -226,6 +226,6 @@ startup_config_dict = { "experimental_enabled": ( "", "", - "Experimental features to be used, comma seperated. Options [system-replication, high-availability]", + "Experimental features to be used, comma-separated. 
Options [system-replication, text-search, high-availability]", ), } diff --git a/tests/e2e/text_search/CMakeLists.txt b/tests/e2e/text_search/CMakeLists.txt new file mode 100644 index 000000000..db2af7a11 --- /dev/null +++ b/tests/e2e/text_search/CMakeLists.txt @@ -0,0 +1,6 @@ +function(copy_text_search_e2e_python_files FILE_NAME) + copy_e2e_python_files(text_search ${FILE_NAME}) +endfunction() + +copy_text_search_e2e_python_files(common.py) +copy_text_search_e2e_python_files(test_text_search.py) diff --git a/tests/e2e/text_search/common.py b/tests/e2e/text_search/common.py new file mode 100644 index 000000000..0f28351d3 --- /dev/null +++ b/tests/e2e/text_search/common.py @@ -0,0 +1,87 @@ +# Copyright 2023 Memgraph Ltd. +# +# Use of this software is governed by the Business Source License +# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source +# License, and you may not use this file except in compliance with the Business Source License. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0, included in the file +# licenses/APL.txt. 
+ +import typing + +import mgclient +import pytest +from gqlalchemy import Memgraph + + +def execute_and_fetch_all(cursor: mgclient.Cursor, query: str, params: typing.Optional[dict] = None) -> typing.List[tuple]: + cursor.execute(query, params if params is not None else {}) + return cursor.fetchall() + + +@pytest.fixture +def connect(**kwargs) -> mgclient.Connection: + connection = mgclient.connect(host="localhost", port=7687, **kwargs) + connection.autocommit = True + cursor = connection.cursor() + execute_and_fetch_all(cursor, """USE DATABASE memgraph""") + try: + execute_and_fetch_all(cursor, """DROP DATABASE clean""") + except Exception: # best-effort cleanup; do not swallow KeyboardInterrupt/SystemExit + pass + execute_and_fetch_all(cursor, """MATCH (n) DETACH DELETE n""") + yield connection + + +@pytest.fixture +def memgraph(**kwargs) -> Memgraph: + memgraph = Memgraph() + + yield memgraph + + memgraph.drop_database() + memgraph.drop_indexes() + + +@pytest.fixture +def memgraph_with_text_indexed_data(**kwargs) -> Memgraph: + memgraph = Memgraph() + + memgraph.execute( + """CREATE (:Document {title: "Rules2024", version: 1, fulltext: "random works", date: date("2023-11-14")});""" + ) + memgraph.execute( + """CREATE (:Document {title: "Rules2023", version: 9, fulltext: "text Rules2024", date: date("2023-11-14")});""" + ) + memgraph.execute( + """CREATE (:Document:Revision {title: "Rules2024", version: 2, fulltext: "random words", date: date("2023-12-15")});""" + ) + memgraph.execute("""CREATE (:Revision {title: "OperationSchema", version: 3, date: date("2023-10-01")});""") + memgraph.execute("""CREATE TEXT INDEX complianceDocuments ON :Document;""") + + yield memgraph + + memgraph.execute("""DROP TEXT INDEX complianceDocuments;""") + memgraph.drop_database() + memgraph.drop_indexes() + + +@pytest.fixture +def memgraph_with_mixed_data(**kwargs) -> Memgraph: + memgraph = Memgraph() + + memgraph.execute( + """CREATE (:Document:Revision {title: "Rules2024", version: 1, date: date("2023-11-14"), contents: "Lorem ipsum dolor sit amet"});""" + ) + memgraph.execute( + """CREATE (:Revision
{title: "Rules2024", version: 2, date: date("2023-12-15"), contents: "consectetur adipiscing elit"});""" + ) + memgraph.execute("""CREATE TEXT INDEX complianceDocuments ON :Document;""") + + yield memgraph + + memgraph.execute("""DROP TEXT INDEX complianceDocuments;""") + memgraph.drop_database() + memgraph.drop_indexes() diff --git a/tests/e2e/text_search/test_text_search.py b/tests/e2e/text_search/test_text_search.py new file mode 100644 index 000000000..8d538d464 --- /dev/null +++ b/tests/e2e/text_search/test_text_search.py @@ -0,0 +1,206 @@ +# Copyright 2024 Memgraph Ltd. +# +# Use of this software is governed by the Business Source License +# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source +# License, and you may not use this file except in compliance with the Business Source License. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0, included in the file +# licenses/APL.txt. 
+ +import json +import re +import sys + +import gqlalchemy +import mgclient +import pytest +from common import memgraph, memgraph_with_mixed_data, memgraph_with_text_indexed_data + +GET_RULES_2024_DOCUMENT = """CALL libtext.search("complianceDocuments", "data.title:Rules2024") YIELD node + RETURN node.title AS title, node.version AS version + ORDER BY version ASC, title ASC;""" + + +def test_create_index(memgraph): + memgraph.execute("""CREATE TEXT INDEX exampleIndex ON :Document;""") + + index_info = memgraph.execute_and_fetch("""SHOW INDEX INFO""") + + assert list(index_info) == [ + {"index type": "text (name: exampleIndex)", "label": "Document", "property": None, "count": None} + ] + + +def test_drop_index(memgraph): + memgraph.execute("""DROP TEXT INDEX exampleIndex;""") + + index_info = memgraph.execute_and_fetch("""SHOW INDEX INFO""") + + assert list(index_info) == [] + + +def test_create_existing_index(memgraph): + memgraph.execute("""CREATE TEXT INDEX duplicatedIndex ON :Document;""") + with pytest.raises( + gqlalchemy.exceptions.GQLAlchemyDatabaseError, match='Text index "duplicatedIndex" already exists.' + ) as _: + memgraph.execute("""CREATE TEXT INDEX duplicatedIndex ON :Document;""") + memgraph.execute("""DROP TEXT INDEX duplicatedIndex;""") # cleanup + + +def test_drop_nonexistent_index(memgraph): + with pytest.raises( + gqlalchemy.exceptions.GQLAlchemyDatabaseError, match='Text index "noSuchIndex" doesn’t exist.' 
+ ) as _: + memgraph.execute("""DROP TEXT INDEX noSuchIndex;""") + + +def test_text_search_given_property(memgraph_with_text_indexed_data): + result = list(memgraph_with_text_indexed_data.execute_and_fetch(GET_RULES_2024_DOCUMENT)) + + assert len(result) == 2 and result == [{"title": "Rules2024", "version": 1}, {"title": "Rules2024", "version": 2}] + + +def test_text_search_all_properties(memgraph_with_text_indexed_data): + SEARCH_QUERY = "Rules2024" + + ALL_PROPERTIES_QUERY = f"""CALL libtext.search_all("complianceDocuments", "{SEARCH_QUERY}") YIELD node + RETURN node + ORDER BY node.version ASC, node.title ASC;""" + + result = list(memgraph_with_text_indexed_data.execute_and_fetch(ALL_PROPERTIES_QUERY)) + result_nodes = [record["node"] for record in result] + + assert len(result) == 3 and ( + result_nodes[0].title == SEARCH_QUERY + and result_nodes[1].title == SEARCH_QUERY + and SEARCH_QUERY in result_nodes[2].fulltext + ) + + +def test_regex_text_search(memgraph_with_text_indexed_data): + REGEX_QUERY = """CALL libtext.regex_search("complianceDocuments", "wor.*s") YIELD node + RETURN node + ORDER BY node.version ASC, node.title ASC;""" + + result = list(memgraph_with_text_indexed_data.execute_and_fetch(REGEX_QUERY)) + + assert ( + len(result) == 2 + and re.search("wor.*s", result[0]["node"].fulltext) + and re.search("wor.*s", result[1]["node"].fulltext) + # In this test, all values matching the regex string are found in the nodes' fulltext property only ^ + ) + + +def test_text_search_aggregate(memgraph_with_text_indexed_data): + input_aggregation = json.dumps({"count": {"value_count": {"field": "metadata.gid"}}}, separators=(",", ":")) + expected_aggregation = json.dumps({"count": {"value": 2.0}}, separators=(",", ":")) + + AGGREGATION_QUERY = f"""CALL libtext.aggregate("complianceDocuments", "data.title:Rules2024", '{input_aggregation}') + YIELD aggregation + RETURN aggregation;""" + + result = list(memgraph_with_text_indexed_data.execute_and_fetch(AGGREGATION_QUERY)) +
+ assert len(result) == 1 and result[0]["aggregation"] == expected_aggregation + + +def test_text_search_query_boolean(memgraph_with_text_indexed_data): + BOOLEAN_QUERY = """CALL libtext.search("complianceDocuments", "(data.title:Rules2023 OR data.title:Rules2024) AND data.fulltext:words") YIELD node + RETURN node.title AS title, node.version AS version + ORDER BY version ASC, title ASC;""" + + result = list(memgraph_with_text_indexed_data.execute_and_fetch(BOOLEAN_QUERY)) + + assert len(result) == 1 and result == [{"title": "Rules2024", "version": 2}] + + +def test_create_indexed_node(memgraph_with_text_indexed_data): + memgraph_with_text_indexed_data.execute("""CREATE (:Document {title: "Rules2024", version: 3});""") + + result = list(memgraph_with_text_indexed_data.execute_and_fetch(GET_RULES_2024_DOCUMENT)) + + assert len(result) == 3 and result == [ + {"title": "Rules2024", "version": 1}, + {"title": "Rules2024", "version": 2}, + {"title": "Rules2024", "version": 3}, + ] + + +def test_delete_indexed_node(memgraph_with_text_indexed_data): + memgraph_with_text_indexed_data.execute("""MATCH (n:Document {title: "Rules2024", version: 2}) DETACH DELETE n;""") + + result = list(memgraph_with_text_indexed_data.execute_and_fetch(GET_RULES_2024_DOCUMENT)) + + assert len(result) == 1 and result == [{"title": "Rules2024", "version": 1}] + + +def test_add_indexed_label(memgraph_with_mixed_data): + memgraph_with_mixed_data.execute("""MATCH (n:Revision {version:2}) SET n:Document;""") + + result = list(memgraph_with_mixed_data.execute_and_fetch(GET_RULES_2024_DOCUMENT)) + + assert len(result) == 2 and result == [{"title": "Rules2024", "version": 1}, {"title": "Rules2024", "version": 2}] + + +def test_remove_indexed_label(memgraph_with_mixed_data): + memgraph_with_mixed_data.execute("""MATCH (n:Document {version: 1}) REMOVE n:Document;""") + + result = list(memgraph_with_mixed_data.execute_and_fetch(GET_RULES_2024_DOCUMENT)) + + assert len(result) == 0 + + +def 
test_update_text_property_of_indexed_node(memgraph_with_text_indexed_data): + memgraph_with_text_indexed_data.execute("""MATCH (n:Document {version:1}) SET n.title = "Rules2030";""") + + result = list( + memgraph_with_text_indexed_data.execute_and_fetch( + """CALL libtext.search("complianceDocuments", "data.title:Rules2030") YIELD node + RETURN node.title AS title, node.version AS version + ORDER BY version ASC, title ASC;""" + ) + ) + + assert len(result) == 1 and result == [{"title": "Rules2030", "version": 1}] + + +def test_add_unindexable_property_to_indexed_node(memgraph_with_text_indexed_data): + try: + memgraph_with_text_indexed_data.execute("""MATCH (n:Document {version:1}) SET n.randomList = [2, 3, 4, 5];""") + except Exception: + assert False + + +def test_remove_indexable_property_from_indexed_node(memgraph_with_text_indexed_data): + try: + memgraph_with_text_indexed_data.execute( + """MATCH (n:Document {version:1}) REMOVE n.title, n.version, n.fulltext, n.date;""" + ) + except Exception: + assert False + + +def test_remove_unindexable_property_from_indexed_node(memgraph_with_text_indexed_data): + try: + memgraph_with_text_indexed_data.execute_and_fetch( + """MATCH (n:Document {date: date("2023-12-15")}) REMOVE n.date;""" + ) + except Exception: + assert False + + +def test_text_search_nonexistent_index(memgraph_with_text_indexed_data): + NONEXISTENT_INDEX_QUERY = """CALL libtext.search("noSuchIndex", "data.fulltext:words") YIELD node + RETURN node.title AS title, node.version AS version + ORDER BY version ASC, title ASC;""" + + with pytest.raises(mgclient.DatabaseError, match='Text index "noSuchIndex" doesn’t exist.') as _: + list(memgraph_with_text_indexed_data.execute_and_fetch(NONEXISTENT_INDEX_QUERY)) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-rA"])) diff --git a/tests/e2e/text_search/test_text_search_disabled.py b/tests/e2e/text_search/test_text_search_disabled.py new file mode 100644 index 000000000..064f7b409 --- 
/dev/null +++ b/tests/e2e/text_search/test_text_search_disabled.py @@ -0,0 +1,69 @@ +# Copyright 2024 Memgraph Ltd. +# +# Use of this software is governed by the Business Source License +# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source +# License, and you may not use this file except in compliance with the Business Source License. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0, included in the file +# licenses/APL.txt. + +import json +import sys + +import gqlalchemy +import pytest +from common import memgraph + +TEXT_SEARCH_DISABLED_ERROR = ( + "To use text indices and text search, start Memgraph with the experimental text search feature enabled." +) + + +def test_create_index(memgraph): + with pytest.raises(gqlalchemy.exceptions.GQLAlchemyDatabaseError, match=TEXT_SEARCH_DISABLED_ERROR) as _: + memgraph.execute("""CREATE TEXT INDEX exampleIndex ON :Document;""") + + +def test_drop_index(memgraph): + with pytest.raises(gqlalchemy.exceptions.GQLAlchemyDatabaseError, match=TEXT_SEARCH_DISABLED_ERROR) as _: + memgraph.execute("""DROP TEXT INDEX exampleIndex;""") + + +def test_text_search_given_property(memgraph): + with pytest.raises(gqlalchemy.exceptions.GQLAlchemyDatabaseError, match=TEXT_SEARCH_DISABLED_ERROR) as _: + memgraph.execute( + """CALL libtext.search("complianceDocuments", "data.title:Rules2024") YIELD node + RETURN node;""" + ) + + +def test_text_search_all_properties(memgraph): + with pytest.raises(gqlalchemy.exceptions.GQLAlchemyDatabaseError, match=TEXT_SEARCH_DISABLED_ERROR) as _: + memgraph.execute( + """CALL libtext.search_all("complianceDocuments", "Rules2024") YIELD node + RETURN node;""" + ) + + +def test_regex_text_search(memgraph): + with pytest.raises(gqlalchemy.exceptions.GQLAlchemyDatabaseError, match=TEXT_SEARCH_DISABLED_ERROR) as _: + 
memgraph.execute( + """CALL libtext.regex_search("complianceDocuments", "wor.*s") YIELD node + RETURN node;""" + ) + + +def test_text_search_aggregate(memgraph): + with pytest.raises(gqlalchemy.exceptions.GQLAlchemyDatabaseError, match=TEXT_SEARCH_DISABLED_ERROR) as _: + input_aggregation = json.dumps({"count": {"value_count": {"field": "metadata.gid"}}}, separators=(",", ":")) + + memgraph.execute( + f"""CALL libtext.aggregate("complianceDocuments", "wor.*s", '{input_aggregation}') YIELD aggregation + RETURN aggregation;""" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-rA"])) diff --git a/tests/e2e/text_search/workloads.yaml b/tests/e2e/text_search/workloads.yaml new file mode 100644 index 000000000..5b1640715 --- /dev/null +++ b/tests/e2e/text_search/workloads.yaml @@ -0,0 +1,33 @@ +text_search_cluster: &text_search_cluster + cluster: + main: + args: + [ + "--bolt-port", + "7687", + "--log-level=TRACE", + "--experimental-enabled=text-search", + ] + log_file: "text_search.log" + setup_queries: [] + validation_queries: [] + +text_search_disabled_cluster: &text_search_disabled_cluster + cluster: + main: + args: ["--bolt-port", "7687", "--log-level=TRACE"] + log_file: "text_search.log" + setup_queries: [] + validation_queries: [] + +workloads: + - name: "Test behavior of text search in Memgraph" + binary: "tests/e2e/pytest_runner.sh" + proc: "tests/e2e/text_search/query_modules/" + args: ["text_search/test_text_search.py"] + <<: *text_search_cluster + - name: "Test behavior of text search in Memgraph when disabled" + binary: "tests/e2e/pytest_runner.sh" + proc: "tests/e2e/text_search/query_modules/" + args: ["text_search/test_text_search_disabled.py"] + <<: *text_search_disabled_cluster diff --git a/tests/unit/query_dump.cpp b/tests/unit/query_dump.cpp index a2ca2864d..2dd1e7ac7 100644 --- a/tests/unit/query_dump.cpp +++ b/tests/unit/query_dump.cpp @@ -71,6 +71,11 @@ struct DatabaseState { std::string property; }; + struct TextItem { + 
std::string index_name; + std::string label; + }; + struct LabelPropertiesItem { std::string label; std::set> properties; @@ -80,6 +85,7 @@ struct DatabaseState { std::set edges; std::set label_indices; std::set label_property_indices; + std::set text_indices; std::set existence_constraints; std::set unique_constraints; }; @@ -106,6 +112,10 @@ bool operator<(const DatabaseState::LabelPropertyItem &first, const DatabaseStat return first.property < second.property; } +bool operator<(const DatabaseState::TextItem &first, const DatabaseState::TextItem &second) { + return first.index_name < second.index_name || (first.index_name == second.index_name && first.label < second.label); +} + bool operator<(const DatabaseState::LabelPropertiesItem &first, const DatabaseState::LabelPropertiesItem &second) { if (first.label != second.label) return first.label < second.label; return first.properties < second.properties; @@ -128,6 +138,10 @@ bool operator==(const DatabaseState::LabelPropertyItem &first, const DatabaseSta return first.label == second.label && first.property == second.property; } +bool operator==(const DatabaseState::TextItem &first, const DatabaseState::TextItem &second) { + return first.index_name == second.index_name && first.label == second.label; +} + bool operator==(const DatabaseState::LabelPropertiesItem &first, const DatabaseState::LabelPropertiesItem &second) { return first.label == second.label && first.properties == second.properties; } @@ -185,6 +199,7 @@ DatabaseState GetState(memgraph::storage::Storage *db) { // Capture all indices std::set label_indices; std::set label_property_indices; + std::set text_indices; { auto info = dba->ListAllIndices(); for (const auto &item : info.label) { @@ -193,6 +208,9 @@ DatabaseState GetState(memgraph::storage::Storage *db) { for (const auto &item : info.label_property) { label_property_indices.insert({dba->LabelToName(item.first), dba->PropertyToName(item.second)}); } + for (const auto &item : info.text_indices) { + text_indices.insert({item.first,
dba->LabelToName(item.second)}); + } } // Capture all constraints @@ -212,7 +230,8 @@ DatabaseState GetState(memgraph::storage::Storage *db) { } } - return {vertices, edges, label_indices, label_property_indices, existence_constraints, unique_constraints}; + return {vertices, edges, label_indices, label_property_indices, text_indices, existence_constraints, + unique_constraints}; } auto Execute(memgraph::query::InterpreterContext *context, memgraph::dbms::DatabaseAccess db, diff --git a/tests/unit/storage_v2_decoder_encoder.cpp b/tests/unit/storage_v2_decoder_encoder.cpp index 15db49b1c..0264e2287 100644 --- a/tests/unit/storage_v2_decoder_encoder.cpp +++ b/tests/unit/storage_v2_decoder_encoder.cpp @@ -358,6 +358,8 @@ TEST_F(DecoderEncoderTest, PropertyValueInvalidMarker) { case memgraph::storage::durability::Marker::DELTA_LABEL_PROPERTY_INDEX_STATS_CLEAR: case memgraph::storage::durability::Marker::DELTA_EDGE_TYPE_INDEX_CREATE: case memgraph::storage::durability::Marker::DELTA_EDGE_TYPE_INDEX_DROP: + case memgraph::storage::durability::Marker::DELTA_TEXT_INDEX_CREATE: + case memgraph::storage::durability::Marker::DELTA_TEXT_INDEX_DROP: case memgraph::storage::durability::Marker::DELTA_EXISTENCE_CONSTRAINT_CREATE: case memgraph::storage::durability::Marker::DELTA_EXISTENCE_CONSTRAINT_DROP: case memgraph::storage::durability::Marker::DELTA_UNIQUE_CONSTRAINT_CREATE: diff --git a/tests/unit/storage_v2_get_info.cpp b/tests/unit/storage_v2_get_info.cpp index 71dbc1a8d..ee5c1bb62 100644 --- a/tests/unit/storage_v2_get_info.cpp +++ b/tests/unit/storage_v2_get_info.cpp @@ -146,6 +146,7 @@ TYPED_TEST(InfoTest, InfoCheck) { ASSERT_LT(info.disk_usage, 1000'000); ASSERT_EQ(info.label_indices, 1); ASSERT_EQ(info.label_property_indices, 1); + ASSERT_EQ(info.text_indices, 0); ASSERT_EQ(info.existence_constraints, 0); ASSERT_EQ(info.unique_constraints, 2); ASSERT_EQ(info.storage_mode, this->mode); diff --git a/tests/unit/storage_v2_wal_file.cpp b/tests/unit/storage_v2_wal_file.cpp 
index 4094090f5..a94b20590 100644 --- a/tests/unit/storage_v2_wal_file.cpp +++ b/tests/unit/storage_v2_wal_file.cpp @@ -53,6 +53,10 @@ memgraph::storage::durability::WalDeltaData::Type StorageMetadataOperationToWalD return memgraph::storage::durability::WalDeltaData::Type::LABEL_PROPERTY_INDEX_STATS_SET; case memgraph::storage::durability::StorageMetadataOperation::LABEL_PROPERTY_INDEX_STATS_CLEAR: return memgraph::storage::durability::WalDeltaData::Type::LABEL_PROPERTY_INDEX_STATS_CLEAR; + case memgraph::storage::durability::StorageMetadataOperation::TEXT_INDEX_CREATE: + return memgraph::storage::durability::WalDeltaData::Type::TEXT_INDEX_CREATE; + case memgraph::storage::durability::StorageMetadataOperation::TEXT_INDEX_DROP: + return memgraph::storage::durability::WalDeltaData::Type::TEXT_INDEX_DROP; case memgraph::storage::durability::StorageMetadataOperation::EXISTENCE_CONSTRAINT_CREATE: return memgraph::storage::durability::WalDeltaData::Type::EXISTENCE_CONSTRAINT_CREATE; case memgraph::storage::durability::StorageMetadataOperation::EXISTENCE_CONSTRAINT_DROP: @@ -252,7 +256,7 @@ class DeltaGenerator final { ASSERT_TRUE(false) << "Unexpected statistics operation!"; } } - wal_file_.AppendOperation(operation, label_id, property_ids, l_stats, lp_stats, timestamp_); + wal_file_.AppendOperation(operation, std::nullopt, label_id, property_ids, l_stats, lp_stats, timestamp_); if (valid_) { UpdateStats(timestamp_, 1); memgraph::storage::durability::WalDeltaData data; @@ -271,6 +275,8 @@ class DeltaGenerator final { break; case memgraph::storage::durability::StorageMetadataOperation::LABEL_PROPERTY_INDEX_CREATE: case memgraph::storage::durability::StorageMetadataOperation::LABEL_PROPERTY_INDEX_DROP: + case memgraph::storage::durability::StorageMetadataOperation::TEXT_INDEX_CREATE: + case memgraph::storage::durability::StorageMetadataOperation::TEXT_INDEX_DROP: case memgraph::storage::durability::StorageMetadataOperation::EXISTENCE_CONSTRAINT_CREATE: case 
memgraph::storage::durability::StorageMetadataOperation::EXISTENCE_CONSTRAINT_DROP: data.operation_label_property.label = label; @@ -313,6 +319,8 @@ class DeltaGenerator final { case memgraph::storage::durability::StorageMetadataOperation::LABEL_INDEX_STATS_SET: case memgraph::storage::durability::StorageMetadataOperation::LABEL_PROPERTY_INDEX_CREATE: case memgraph::storage::durability::StorageMetadataOperation::LABEL_PROPERTY_INDEX_DROP: + case memgraph::storage::durability::StorageMetadataOperation::TEXT_INDEX_CREATE: + case memgraph::storage::durability::StorageMetadataOperation::TEXT_INDEX_DROP: case memgraph::storage::durability::StorageMetadataOperation::EXISTENCE_CONSTRAINT_CREATE: case memgraph::storage::durability::StorageMetadataOperation::EXISTENCE_CONSTRAINT_DROP:; case memgraph::storage::durability::StorageMetadataOperation::LABEL_PROPERTY_INDEX_STATS_SET: