Add support for all-property and regex text search

This commit is contained in:
Ante Pušić 2024-02-19 17:38:16 +01:00
parent 0490319d3a
commit e19bc6d537
10 changed files with 149 additions and 30 deletions

View File

@ -331,8 +331,8 @@ inline bool graph_has_text_index(mgp_graph *graph, const char *index_name) {
}
// Thin wrapper around the C API: forwards its arguments to mgp_graph_search_text_index through MgInvoke
// and returns the mgp_map pointer that call yields.
inline mgp_map *graph_search_text_index(mgp_graph *graph, const char *index_name, const char *search_query,
mgp_memory *memory) {
return MgInvoke<mgp_map *>(mgp_graph_search_text_index, graph, index_name, search_query, memory);
const char *search_mode, mgp_memory *memory) {
return MgInvoke<mgp_map *>(mgp_graph_search_text_index, graph, index_name, search_query, search_mode, memory);
}
inline mgp_vertices_iterator *graph_iter_vertices(mgp_graph *g, mgp_memory *memory) {

View File

@ -899,7 +899,7 @@ enum mgp_error mgp_graph_has_text_index(struct mgp_graph *graph, const char *ind
/// the given query.
/// Return mgp_error::MGP_ERROR_UNABLE_TO_ALLOCATE if unable to allocate search result vertices.
enum mgp_error mgp_graph_search_text_index(struct mgp_graph *graph, const char *index_name, const char *search_query,
struct mgp_memory *memory, struct mgp_map **result);
const char *search_mode, struct mgp_memory *memory, struct mgp_map **result);
/// Creates label index for given label.
/// mgp_error::MGP_ERROR_NO_ERROR is always returned.

View File

@ -4340,9 +4340,10 @@ inline List ListAllLabelPropertyIndices(mgp_graph *memgraph_graph) {
return List(label_property_indices);
}
inline List RunTextSearchQuery(mgp_graph *memgraph_graph, std::string_view index_name, std::string_view search_query) {
auto results_or_error =
Map(mgp::MemHandlerCallback(graph_search_text_index, memgraph_graph, index_name.data(), search_query.data()));
inline List RunTextSearchQuery(mgp_graph *memgraph_graph, std::string_view index_name, std::string_view search_query,
std::string_view search_mode) {
auto results_or_error = Map(mgp::MemHandlerCallback(graph_search_text_index, memgraph_graph, index_name.data(),
search_query.data(), search_mode.data()));
auto maybe_error = results_or_error["error_msg"].ValueString();
if (!maybe_error.empty()) {
throw std::runtime_error{maybe_error.data()};

View File

@ -566,8 +566,9 @@ class DbAccessor final {
accessor_->TextIndexUpdateVertex(vertex.impl_, removed_labels);
}
// Forwards the text index search to accessor_->TextIndexSearch and returns the Gids it produces.
std::vector<storage::Gid> TextIndexSearch(const std::string &index_name, const std::string &search_query) const {
return accessor_->TextIndexSearch(index_name, search_query);
std::vector<storage::Gid> TextIndexSearch(const std::string &index_name, const std::string &search_query,
const std::string &search_mode) const {
return accessor_->TextIndexSearch(index_name, search_query, search_mode);
}
std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const {

View File

@ -3420,12 +3420,12 @@ void WrapTextSearch(std::vector<memgraph::storage::Gid> vertex_ids, std::string
}
mgp_error mgp_graph_search_text_index(mgp_graph *graph, const char *index_name, const char *search_query,
mgp_memory *memory, mgp_map **result) {
return WrapExceptions([graph, memory, index_name, search_query, result]() {
const char *search_mode, mgp_memory *memory, mgp_map **result) {
return WrapExceptions([graph, memory, index_name, search_query, search_mode, result]() {
std::vector<memgraph::storage::Gid> search_results;
std::string error_msg;
try {
search_results = graph->getImpl()->TextIndexSearch(index_name, search_query);
search_results = graph->getImpl()->TextIndexSearch(index_name, search_query, search_mode);
} catch (memgraph::query::QueryException &e) {
error_msg = e.what();
}

View File

@ -38,6 +38,7 @@ void TextIndex::CreateEmptyIndex(const std::string &index_name, LabelId label) {
mappings["properties"] = {};
mappings["properties"]["metadata"] = {{"type", "json"}, {"fast", true}, {"stored", true}, {"text", true}};
mappings["properties"]["data"] = {{"type", "json"}, {"fast", true}, {"stored", true}, {"text", true}};
mappings["properties"]["all"] = {{"type", "text"}, {"fast", true}, {"stored", true}, {"text", true}};
index_.emplace(index_name,
TextIndexData{.context_ = mgcxx::text_search::create_index(
@ -78,6 +79,33 @@ nlohmann::json TextIndex::SerializeProperties(const std::map<PropertyId, Propert
return serialized_properties;
}
std::string TextIndex::CopyPropertyValuesToString(const std::map<PropertyId, PropertyValue> &properties) {
  // Flattens the scalar property values (Bool, Int, Double, String) into one space-separated string.
  // Property keys are ignored; values of any other type (Null, List, Map, TemporalData, ...) are skipped.
  std::vector<std::string> value_tokens;
  for (const auto &property_entry : properties) {
    const auto &value = property_entry.second;
    const auto value_type = value.type();
    if (value_type == PropertyValue::Type::Bool) {
      value_tokens.emplace_back(value.ValueBool() ? "true" : "false");
    } else if (value_type == PropertyValue::Type::Int) {
      value_tokens.emplace_back(std::to_string(value.ValueInt()));
    } else if (value_type == PropertyValue::Type::Double) {
      value_tokens.emplace_back(std::to_string(value.ValueDouble()));
    } else if (value_type == PropertyValue::Type::String) {
      value_tokens.emplace_back(value.ValueString());
    }
    // Every remaining type contributes nothing to the output.
  }
  return utils::Join(value_tokens, " ");
}
std::vector<mgcxx::text_search::Context *> TextIndex::GetApplicableTextIndices(const std::vector<LabelId> &labels) {
if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) {
throw query::TextSearchDisabledException();
@ -93,11 +121,13 @@ std::vector<mgcxx::text_search::Context *> TextIndex::GetApplicableTextIndices(c
}
void TextIndex::LoadNodeToTextIndices(const std::int64_t gid, const nlohmann::json &properties,
const std::string &all_property_values_string,
const std::vector<mgcxx::text_search::Context *> &applicable_text_indices) {
// NOTE: Text indexes are presently all-property indices. If we allow text indexes restricted to specific properties,
// an indexable document should be created for each applicable index.
nlohmann::json document = {};
document["data"] = properties;
document["all"] = all_property_values_string;
document["metadata"] = {};
document["metadata"]["gid"] = gid;
document["metadata"]["deleted"] = false;
@ -135,7 +165,7 @@ void TextIndex::AddNode(Vertex *vertex_after_update, NameIdMapper *name_id_mappe
auto vertex_properties = vertex_after_update->properties.Properties();
LoadNodeToTextIndices(vertex_after_update->gid.AsInt(), SerializeProperties(vertex_properties, name_id_mapper),
applicable_text_indices);
CopyPropertyValuesToString(vertex_properties), applicable_text_indices);
}
void TextIndex::AddNode(Vertex *vertex_after_update, NameIdMapper *name_id_mapper) {
@ -203,7 +233,7 @@ void TextIndex::CreateIndex(const std::string &index_name, LabelId label, memgra
auto vertex_properties = v.Properties(View::NEW).GetValue();
LoadNodeToTextIndices(v.Gid().AsInt(), SerializeProperties(vertex_properties, db),
{&index_.at(index_name).context_});
CopyPropertyValuesToString(vertex_properties), {&index_.at(index_name).context_});
}
CommitLoadedNodes(index_.at(index_name).context_);
@ -221,7 +251,7 @@ void TextIndex::RecoverIndex(const std::string &index_name, LabelId label,
nlohmann::json document = {};
auto vertex_properties = v.properties.Properties();
LoadNodeToTextIndices(v.gid.AsInt(), SerializeProperties(vertex_properties, name_id_mapper),
{&index_.at(index_name).context_});
CopyPropertyValuesToString(vertex_properties), {&index_.at(index_name).context_});
}
CommitLoadedNodes(index_.at(index_name).context_);
@ -251,7 +281,57 @@ LabelId TextIndex::DropIndex(const std::string &index_name) {
bool TextIndex::IndexExists(const std::string &index_name) const { return index_.contains(index_name); }
std::vector<Gid> TextIndex::Search(const std::string &index_name, const std::string &search_query) {
mgcxx::text_search::SearchOutput TextIndex::TQLSearch(const std::string &index_name, const std::string &search_query) {
  // Runs `search_query` through mgcxx::text_search::search on the index named `index_name`, asking for the
  // "data" and "metadata" fields of each hit.
  // No search_fields are set: the former heuristic that defaulted them to "data" when the query contained
  // no `:` field delimiter is disabled.
  // Any std::exception raised while looking up or querying the index is rethrown as query::TextSearchException.
  mgcxx::text_search::SearchInput query_input{.search_query = search_query, .return_fields = {"data", "metadata"}};
  try {
    return mgcxx::text_search::search(index_.at(index_name).context_, query_input);
  } catch (const std::exception &error) {
    throw query::TextSearchException("Tantivy error: {}", error.what());
  }
}
mgcxx::text_search::SearchOutput TextIndex::RegexSearch(const std::string &index_name,
                                                        const std::string &search_query) {
  // Runs `search_query` through mgcxx::text_search::regex_search against the "all" field of the index named
  // `index_name`; only the "metadata" field of each hit is requested.
  // Any std::exception raised while looking up or querying the index is rethrown as query::TextSearchException.
  mgcxx::text_search::SearchInput regex_input{
      .search_fields = {"all"}, .search_query = search_query, .return_fields = {"metadata"}};
  try {
    return mgcxx::text_search::regex_search(index_.at(index_name).context_, regex_input);
  } catch (const std::exception &error) {
    throw query::TextSearchException("Tantivy error: {}", error.what());
  }
}
mgcxx::text_search::SearchOutput TextIndex::SearchAllProperties(const std::string &index_name,
                                                                const std::string &search_query) {
  // Runs `search_query` through mgcxx::text_search::search restricted to the "all" field of the index named
  // `index_name`; only the "metadata" field of each hit is requested.
  // Any std::exception raised while looking up or querying the index is rethrown as query::TextSearchException.
  mgcxx::text_search::SearchInput all_fields_input{
      .search_fields = {"all"}, .search_query = search_query, .return_fields = {"metadata"}};
  try {
    return mgcxx::text_search::search(index_.at(index_name).context_, all_fields_input);
  } catch (const std::exception &error) {
    throw query::TextSearchException("Tantivy error: {}", error.what());
  }
}
std::vector<Gid> TextIndex::Search(const std::string &index_name, const std::string &search_query,
const std::string &search_mode) {
if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) {
throw query::TextSearchDisabledException();
}
@ -260,20 +340,18 @@ std::vector<Gid> TextIndex::Search(const std::string &index_name, const std::str
throw query::TextSearchException("Text index \"{}\" doesnt exist.", index_name);
}
auto input = mgcxx::text_search::SearchInput{.search_query = search_query, .return_fields = {"data", "metadata"}};
// Basic check for search fields in the query (Tantivy syntax delimits them with a `:` to the right)
if (search_query.find(":") == std::string::npos) {
input.search_fields = {"data"};
mgcxx::text_search::SearchOutput search_results;
if (search_mode == "specify_property") {
search_results = TQLSearch(index_name, search_query);
} else if (search_mode == "regex") {
search_results = RegexSearch(index_name, search_query);
} else if (search_mode == "all_properties") {
search_results = SearchAllProperties(index_name, search_query);
} else {
throw query::TextSearchException("Unsupported search type"); // TODO improve
}
std::vector<Gid> found_nodes;
mgcxx::text_search::SearchOutput search_results;
try {
search_results = mgcxx::text_search::search(index_.at(index_name).context_, input);
} catch (const std::exception &e) {
throw query::TextSearchException("Tantivy error: {}", e.what());
}
for (const auto &doc : search_results.docs) {
// The CXX .data() method (https://cxx.rs/binding/string.html) may overestimate string length, causing JSON parsing
// errors downstream. We prevent this by resizing the converted string with the correctly-working .length() method.

View File

@ -36,9 +36,12 @@ class TextIndex {
template <typename T>
nlohmann::json SerializeProperties(const std::map<PropertyId, PropertyValue> &properties, T *name_resolver);
std::string CopyPropertyValuesToString(const std::map<PropertyId, PropertyValue> &properties);
std::vector<mgcxx::text_search::Context *> GetApplicableTextIndices(const std::vector<LabelId> &labels);
void LoadNodeToTextIndices(const std::int64_t gid, const nlohmann::json &properties,
const std::string &all_property_values_string,
const std::vector<mgcxx::text_search::Context *> &applicable_text_indices);
void CommitLoadedNodes(mgcxx::text_search::Context &index_context);
@ -48,6 +51,12 @@ class TextIndex {
void RemoveNode(Vertex *vertex, const std::vector<mgcxx::text_search::Context *> &applicable_text_indices);
mgcxx::text_search::SearchOutput TQLSearch(const std::string &index_name, const std::string &search_query);
mgcxx::text_search::SearchOutput RegexSearch(const std::string &index_name, const std::string &search_query);
mgcxx::text_search::SearchOutput SearchAllProperties(const std::string &index_name, const std::string &search_query);
public:
TextIndex() = default;
@ -76,7 +85,8 @@ class TextIndex {
bool IndexExists(const std::string &index_name) const;
std::vector<Gid> Search(const std::string &index_name, const std::string &search_query);
std::vector<Gid> Search(const std::string &index_name, const std::string &search_query,
const std::string &search_mode);
void Commit();

View File

@ -244,8 +244,9 @@ class Storage {
storage_->indices_.text_index_.UpdateNode(vertex.vertex_, storage_->name_id_mapper_.get(), removed_labels);
}
// Forwards the search to storage_->indices_.text_index_.Search and returns the Gids it produces.
std::vector<Gid> TextIndexSearch(const std::string &index_name, const std::string &search_query) const {
return storage_->indices_.text_index_.Search(index_name, search_query);
std::vector<Gid> TextIndexSearch(const std::string &index_name, const std::string &search_query,
const std::string &search_mode) const {
return storage_->indices_.text_index_.Search(index_name, search_query, search_mode);
}
virtual IndicesInfo ListAllIndices() const = 0;

View File

@ -50,7 +50,7 @@ def memgraph_with_text_indexed_data(**kwargs) -> Memgraph:
memgraph = Memgraph()
memgraph.execute(
"""CREATE (:Document {title: "Rules2024", version: 1, fulltext: "random fulltext", date: date("2023-11-14")});"""
"""CREATE (:Document {title: "Rules2023", version: 1, fulltext: "Rules2024", date: date("2023-11-14")});"""
)
memgraph.execute(
"""CREATE (:Document:Revision {title: "Rules2024", version: 2, fulltext: "random words", date: date("2023-12-15")});"""

View File

@ -59,6 +59,34 @@ def test_text_search_given_property(memgraph_with_text_indexed_data):
assert len(result) == 2 and result == [{"title": "Rules2024", "version": 1}, {"title": "Rules2024", "version": 2}]
def test_text_search_all_properties(memgraph_with_text_indexed_data):
# Calls text_search.search_all on the "complianceDocuments" index with the query "Rules2024" and prints
# the returned rows.
result = list(
memgraph_with_text_indexed_data.execute_and_fetch(
"""CALL text_search.search_all("complianceDocuments", "Rules2024") YIELD node
RETURN node
ORDER BY node.version ASC, node.title ASC;"""
)
)
print(result)
# NOTE(review): placeholder assertion. The test passes whenever the query does not raise, whatever
# `result` contains; replace with a check on the expected nodes.
assert True
def test_text_search_regex(memgraph_with_text_indexed_data):
# Calls text_search.regex_search on the "complianceDocuments" index with the pattern "Rules*" and prints
# the returned rows.
# NOTE(review): "Rules*" reads like a glob; as a regular expression it means "Rule" followed by zero or
# more "s". Confirm the intended pattern (possibly "Rules.*").
result = list(
memgraph_with_text_indexed_data.execute_and_fetch(
"""CALL text_search.regex_search("complianceDocuments", "Rules*") YIELD node
RETURN node
ORDER BY node.version ASC, node.title ASC;"""
)
)
print(result)
# NOTE(review): placeholder assertion. The test passes whenever the query does not raise, whatever
# `result` contains; replace with a check on the expected nodes.
assert True
def test_text_search_query_boolean(memgraph_with_text_indexed_data):
BOOLEAN_QUERY = """CALL text_search.search("complianceDocuments", "(data.title:Rules2023 OR data.title:Rules2024) AND data.fulltext:words") YIELD node
RETURN node.title AS title, node.version AS version