Add support for all-property and regex text search
This commit is contained in:
parent
0490319d3a
commit
e19bc6d537
@ -331,8 +331,8 @@ inline bool graph_has_text_index(mgp_graph *graph, const char *index_name) {
|
||||
}
|
||||
|
||||
inline mgp_map *graph_search_text_index(mgp_graph *graph, const char *index_name, const char *search_query,
|
||||
mgp_memory *memory) {
|
||||
return MgInvoke<mgp_map *>(mgp_graph_search_text_index, graph, index_name, search_query, memory);
|
||||
const char *search_mode, mgp_memory *memory) {
|
||||
return MgInvoke<mgp_map *>(mgp_graph_search_text_index, graph, index_name, search_query, search_mode, memory);
|
||||
}
|
||||
|
||||
inline mgp_vertices_iterator *graph_iter_vertices(mgp_graph *g, mgp_memory *memory) {
|
||||
|
@ -899,7 +899,7 @@ enum mgp_error mgp_graph_has_text_index(struct mgp_graph *graph, const char *ind
|
||||
/// the given query.
|
||||
/// Return mgp_error::MGP_ERROR_UNABLE_TO_ALLOCATE if unable to allocate search result vertices.
|
||||
enum mgp_error mgp_graph_search_text_index(struct mgp_graph *graph, const char *index_name, const char *search_query,
|
||||
struct mgp_memory *memory, struct mgp_map **result);
|
||||
const char *search_mode, struct mgp_memory *memory, struct mgp_map **result);
|
||||
|
||||
/// Creates label index for given label.
|
||||
/// mgp_error::MGP_ERROR_NO_ERROR is always returned.
|
||||
|
@ -4340,9 +4340,10 @@ inline List ListAllLabelPropertyIndices(mgp_graph *memgraph_graph) {
|
||||
return List(label_property_indices);
|
||||
}
|
||||
|
||||
inline List RunTextSearchQuery(mgp_graph *memgraph_graph, std::string_view index_name, std::string_view search_query) {
|
||||
auto results_or_error =
|
||||
Map(mgp::MemHandlerCallback(graph_search_text_index, memgraph_graph, index_name.data(), search_query.data()));
|
||||
inline List RunTextSearchQuery(mgp_graph *memgraph_graph, std::string_view index_name, std::string_view search_query,
|
||||
std::string_view search_mode) {
|
||||
auto results_or_error = Map(mgp::MemHandlerCallback(graph_search_text_index, memgraph_graph, index_name.data(),
|
||||
search_query.data(), search_mode.data()));
|
||||
auto maybe_error = results_or_error["error_msg"].ValueString();
|
||||
if (!maybe_error.empty()) {
|
||||
throw std::runtime_error{maybe_error.data()};
|
||||
|
@ -566,8 +566,9 @@ class DbAccessor final {
|
||||
accessor_->TextIndexUpdateVertex(vertex.impl_, removed_labels);
|
||||
}
|
||||
|
||||
std::vector<storage::Gid> TextIndexSearch(const std::string &index_name, const std::string &search_query) const {
|
||||
return accessor_->TextIndexSearch(index_name, search_query);
|
||||
std::vector<storage::Gid> TextIndexSearch(const std::string &index_name, const std::string &search_query,
|
||||
const std::string &search_mode) const {
|
||||
return accessor_->TextIndexSearch(index_name, search_query, search_mode);
|
||||
}
|
||||
|
||||
std::optional<storage::LabelIndexStats> GetIndexStats(const storage::LabelId &label) const {
|
||||
|
@ -3420,12 +3420,12 @@ void WrapTextSearch(std::vector<memgraph::storage::Gid> vertex_ids, std::string
|
||||
}
|
||||
|
||||
mgp_error mgp_graph_search_text_index(mgp_graph *graph, const char *index_name, const char *search_query,
|
||||
mgp_memory *memory, mgp_map **result) {
|
||||
return WrapExceptions([graph, memory, index_name, search_query, result]() {
|
||||
const char *search_mode, mgp_memory *memory, mgp_map **result) {
|
||||
return WrapExceptions([graph, memory, index_name, search_query, search_mode, result]() {
|
||||
std::vector<memgraph::storage::Gid> search_results;
|
||||
std::string error_msg;
|
||||
try {
|
||||
search_results = graph->getImpl()->TextIndexSearch(index_name, search_query);
|
||||
search_results = graph->getImpl()->TextIndexSearch(index_name, search_query, search_mode);
|
||||
} catch (memgraph::query::QueryException &e) {
|
||||
error_msg = e.what();
|
||||
}
|
||||
|
@ -38,6 +38,7 @@ void TextIndex::CreateEmptyIndex(const std::string &index_name, LabelId label) {
|
||||
mappings["properties"] = {};
|
||||
mappings["properties"]["metadata"] = {{"type", "json"}, {"fast", true}, {"stored", true}, {"text", true}};
|
||||
mappings["properties"]["data"] = {{"type", "json"}, {"fast", true}, {"stored", true}, {"text", true}};
|
||||
mappings["properties"]["all"] = {{"type", "text"}, {"fast", true}, {"stored", true}, {"text", true}};
|
||||
|
||||
index_.emplace(index_name,
|
||||
TextIndexData{.context_ = mgcxx::text_search::create_index(
|
||||
@ -78,6 +79,33 @@ nlohmann::json TextIndex::SerializeProperties(const std::map<PropertyId, Propert
|
||||
return serialized_properties;
|
||||
}
|
||||
|
||||
std::string TextIndex::CopyPropertyValuesToString(const std::map<PropertyId, PropertyValue> &properties) {
|
||||
std::vector<std::string> indexable_properties_as_string;
|
||||
for (const auto &[_, prop_value] : properties) {
|
||||
switch (prop_value.type()) {
|
||||
case PropertyValue::Type::Bool:
|
||||
indexable_properties_as_string.push_back(prop_value.ValueBool() ? "true" : "false");
|
||||
break;
|
||||
case PropertyValue::Type::Int:
|
||||
indexable_properties_as_string.push_back(std::to_string(prop_value.ValueInt()));
|
||||
break;
|
||||
case PropertyValue::Type::Double:
|
||||
indexable_properties_as_string.push_back(std::to_string(prop_value.ValueDouble()));
|
||||
break;
|
||||
case PropertyValue::Type::String:
|
||||
indexable_properties_as_string.push_back(prop_value.ValueString());
|
||||
break;
|
||||
case PropertyValue::Type::Null:
|
||||
case PropertyValue::Type::List:
|
||||
case PropertyValue::Type::Map:
|
||||
case PropertyValue::Type::TemporalData:
|
||||
default:
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return utils::Join(indexable_properties_as_string, " ");
|
||||
}
|
||||
|
||||
std::vector<mgcxx::text_search::Context *> TextIndex::GetApplicableTextIndices(const std::vector<LabelId> &labels) {
|
||||
if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) {
|
||||
throw query::TextSearchDisabledException();
|
||||
@ -93,11 +121,13 @@ std::vector<mgcxx::text_search::Context *> TextIndex::GetApplicableTextIndices(c
|
||||
}
|
||||
|
||||
void TextIndex::LoadNodeToTextIndices(const std::int64_t gid, const nlohmann::json &properties,
|
||||
const std::string &all_property_values_string,
|
||||
const std::vector<mgcxx::text_search::Context *> &applicable_text_indices) {
|
||||
// NOTE: Text indexes are presently all-property indices. If we allow text indexes restricted to specific properties,
|
||||
// an indexable document should be created for each applicable index.
|
||||
nlohmann::json document = {};
|
||||
document["data"] = properties;
|
||||
document["all"] = all_property_values_string;
|
||||
document["metadata"] = {};
|
||||
document["metadata"]["gid"] = gid;
|
||||
document["metadata"]["deleted"] = false;
|
||||
@ -135,7 +165,7 @@ void TextIndex::AddNode(Vertex *vertex_after_update, NameIdMapper *name_id_mappe
|
||||
|
||||
auto vertex_properties = vertex_after_update->properties.Properties();
|
||||
LoadNodeToTextIndices(vertex_after_update->gid.AsInt(), SerializeProperties(vertex_properties, name_id_mapper),
|
||||
applicable_text_indices);
|
||||
CopyPropertyValuesToString(vertex_properties), applicable_text_indices);
|
||||
}
|
||||
|
||||
void TextIndex::AddNode(Vertex *vertex_after_update, NameIdMapper *name_id_mapper) {
|
||||
@ -203,7 +233,7 @@ void TextIndex::CreateIndex(const std::string &index_name, LabelId label, memgra
|
||||
|
||||
auto vertex_properties = v.Properties(View::NEW).GetValue();
|
||||
LoadNodeToTextIndices(v.Gid().AsInt(), SerializeProperties(vertex_properties, db),
|
||||
{&index_.at(index_name).context_});
|
||||
CopyPropertyValuesToString(vertex_properties), {&index_.at(index_name).context_});
|
||||
}
|
||||
|
||||
CommitLoadedNodes(index_.at(index_name).context_);
|
||||
@ -221,7 +251,7 @@ void TextIndex::RecoverIndex(const std::string &index_name, LabelId label,
|
||||
nlohmann::json document = {};
|
||||
auto vertex_properties = v.properties.Properties();
|
||||
LoadNodeToTextIndices(v.gid.AsInt(), SerializeProperties(vertex_properties, name_id_mapper),
|
||||
{&index_.at(index_name).context_});
|
||||
CopyPropertyValuesToString(vertex_properties), {&index_.at(index_name).context_});
|
||||
}
|
||||
|
||||
CommitLoadedNodes(index_.at(index_name).context_);
|
||||
@ -251,7 +281,57 @@ LabelId TextIndex::DropIndex(const std::string &index_name) {
|
||||
|
||||
bool TextIndex::IndexExists(const std::string &index_name) const { return index_.contains(index_name); }
|
||||
|
||||
std::vector<Gid> TextIndex::Search(const std::string &index_name, const std::string &search_query) {
|
||||
namespace {

/// Invokes `search_fn` and returns its result; any std::exception it raises is rethrown as a
/// query::TextSearchException whose message is prefixed with "Tantivy error: ".
///
/// Shared by the search-mode helpers below so the error translation lives in one place.
template <typename TSearchFn>
mgcxx::text_search::SearchOutput RunSearchTranslatingErrors(const TSearchFn &search_fn) {
  try {
    return search_fn();
  } catch (const std::exception &e) {
    throw query::TextSearchException("Tantivy error: {}", e.what());
  }
}

}  // namespace

/// Runs `search_query` against the text index `index_name` without restricting the search fields.
///
/// The "data" and "metadata" fields of every matching document are requested back.
///
/// @throws query::TextSearchException if the index lookup or the underlying search throws.
mgcxx::text_search::SearchOutput TextIndex::TQLSearch(const std::string &index_name, const std::string &search_query) {
  // No search_fields are set, so no default field is forced onto the query.
  // NOTE(review): presumably the query then has to name its own fields (Tantivy delimits a field
  // with `:`, e.g. `data.title:Rules2024`) — confirm against mgcxx behaviour.
  auto input = mgcxx::text_search::SearchInput{.search_query = search_query, .return_fields = {"data", "metadata"}};

  return RunSearchTranslatingErrors(
      [&] { return mgcxx::text_search::search(index_.at(index_name).context_, input); });
}

/// Runs `search_query` through mgcxx's regex search over the "all" field of the text index `index_name`.
///
/// Only the "metadata" field of every matching document is requested back.
///
/// @throws query::TextSearchException if the index lookup or the underlying search throws.
mgcxx::text_search::SearchOutput TextIndex::RegexSearch(const std::string &index_name,
                                                        const std::string &search_query) {
  auto input = mgcxx::text_search::SearchInput{
      .search_fields = {"all"}, .search_query = search_query, .return_fields = {"metadata"}};

  return RunSearchTranslatingErrors(
      [&] { return mgcxx::text_search::regex_search(index_.at(index_name).context_, input); });
}

/// Runs `search_query` against the "all" field of the text index `index_name`.
///
/// Only the "metadata" field of every matching document is requested back.
///
/// @throws query::TextSearchException if the index lookup or the underlying search throws.
mgcxx::text_search::SearchOutput TextIndex::SearchAllProperties(const std::string &index_name,
                                                                const std::string &search_query) {
  auto input = mgcxx::text_search::SearchInput{
      .search_fields = {"all"}, .search_query = search_query, .return_fields = {"metadata"}};

  return RunSearchTranslatingErrors(
      [&] { return mgcxx::text_search::search(index_.at(index_name).context_, input); });
}
|
||||
|
||||
std::vector<Gid> TextIndex::Search(const std::string &index_name, const std::string &search_query,
|
||||
const std::string &search_mode) {
|
||||
if (!flags::AreExperimentsEnabled(flags::Experiments::TEXT_SEARCH)) {
|
||||
throw query::TextSearchDisabledException();
|
||||
}
|
||||
@ -260,20 +340,18 @@ std::vector<Gid> TextIndex::Search(const std::string &index_name, const std::str
|
||||
throw query::TextSearchException("Text index \"{}\" doesn’t exist.", index_name);
|
||||
}
|
||||
|
||||
auto input = mgcxx::text_search::SearchInput{.search_query = search_query, .return_fields = {"data", "metadata"}};
|
||||
// Basic check for search fields in the query (Tantivy syntax delimits them with a `:` to the right)
|
||||
if (search_query.find(":") == std::string::npos) {
|
||||
input.search_fields = {"data"};
|
||||
mgcxx::text_search::SearchOutput search_results;
|
||||
if (search_mode == "specify_property") {
|
||||
search_results = TQLSearch(index_name, search_query);
|
||||
} else if (search_mode == "regex") {
|
||||
search_results = RegexSearch(index_name, search_query);
|
||||
} else if (search_mode == "all_properties") {
|
||||
search_results = SearchAllProperties(index_name, search_query);
|
||||
} else {
|
||||
throw query::TextSearchException("Unsupported search type"); // TODO improve
|
||||
}
|
||||
|
||||
std::vector<Gid> found_nodes;
|
||||
mgcxx::text_search::SearchOutput search_results;
|
||||
|
||||
try {
|
||||
search_results = mgcxx::text_search::search(index_.at(index_name).context_, input);
|
||||
} catch (const std::exception &e) {
|
||||
throw query::TextSearchException("Tantivy error: {}", e.what());
|
||||
}
|
||||
for (const auto &doc : search_results.docs) {
|
||||
// The CXX .data() method (https://cxx.rs/binding/string.html) may overestimate string length, causing JSON parsing
|
||||
// errors downstream. We prevent this by resizing the converted string with the correctly-working .length() method.
|
||||
|
@ -36,9 +36,12 @@ class TextIndex {
|
||||
template <typename T>
|
||||
nlohmann::json SerializeProperties(const std::map<PropertyId, PropertyValue> &properties, T *name_resolver);
|
||||
|
||||
std::string CopyPropertyValuesToString(const std::map<PropertyId, PropertyValue> &properties);
|
||||
|
||||
std::vector<mgcxx::text_search::Context *> GetApplicableTextIndices(const std::vector<LabelId> &labels);
|
||||
|
||||
void LoadNodeToTextIndices(const std::int64_t gid, const nlohmann::json &properties,
|
||||
const std::string &all_property_values_string,
|
||||
const std::vector<mgcxx::text_search::Context *> &applicable_text_indices);
|
||||
|
||||
void CommitLoadedNodes(mgcxx::text_search::Context &index_context);
|
||||
@ -48,6 +51,12 @@ class TextIndex {
|
||||
|
||||
void RemoveNode(Vertex *vertex, const std::vector<mgcxx::text_search::Context *> &applicable_text_indices);
|
||||
|
||||
mgcxx::text_search::SearchOutput TQLSearch(const std::string &index_name, const std::string &search_query);
|
||||
|
||||
mgcxx::text_search::SearchOutput RegexSearch(const std::string &index_name, const std::string &search_query);
|
||||
|
||||
mgcxx::text_search::SearchOutput SearchAllProperties(const std::string &index_name, const std::string &search_query);
|
||||
|
||||
public:
|
||||
TextIndex() = default;
|
||||
|
||||
@ -76,7 +85,8 @@ class TextIndex {
|
||||
|
||||
bool IndexExists(const std::string &index_name) const;
|
||||
|
||||
std::vector<Gid> Search(const std::string &index_name, const std::string &search_query);
|
||||
std::vector<Gid> Search(const std::string &index_name, const std::string &search_query,
|
||||
const std::string &search_mode);
|
||||
|
||||
void Commit();
|
||||
|
||||
|
@ -244,8 +244,9 @@ class Storage {
|
||||
storage_->indices_.text_index_.UpdateNode(vertex.vertex_, storage_->name_id_mapper_.get(), removed_labels);
|
||||
}
|
||||
|
||||
std::vector<Gid> TextIndexSearch(const std::string &index_name, const std::string &search_query) const {
|
||||
return storage_->indices_.text_index_.Search(index_name, search_query);
|
||||
std::vector<Gid> TextIndexSearch(const std::string &index_name, const std::string &search_query,
|
||||
const std::string &search_mode) const {
|
||||
return storage_->indices_.text_index_.Search(index_name, search_query, search_mode);
|
||||
}
|
||||
|
||||
virtual IndicesInfo ListAllIndices() const = 0;
|
||||
|
@ -50,7 +50,7 @@ def memgraph_with_text_indexed_data(**kwargs) -> Memgraph:
|
||||
memgraph = Memgraph()
|
||||
|
||||
memgraph.execute(
|
||||
"""CREATE (:Document {title: "Rules2024", version: 1, fulltext: "random fulltext", date: date("2023-11-14")});"""
|
||||
"""CREATE (:Document {title: "Rules2023", version: 1, fulltext: "Rules2024", date: date("2023-11-14")});"""
|
||||
)
|
||||
memgraph.execute(
|
||||
"""CREATE (:Document:Revision {title: "Rules2024", version: 2, fulltext: "random words", date: date("2023-12-15")});"""
|
||||
|
@ -59,6 +59,34 @@ def test_text_search_given_property(memgraph_with_text_indexed_data):
|
||||
assert len(result) == 2 and result == [{"title": "Rules2024", "version": 1}, {"title": "Rules2024", "version": 2}]
|
||||
|
||||
|
||||
def test_text_search_all_properties(memgraph_with_text_indexed_data):
    """Search all indexed properties of "complianceDocuments" for "Rules2024"."""
    result = list(
        memgraph_with_text_indexed_data.execute_and_fetch(
            """CALL text_search.search_all("complianceDocuments", "Rules2024") YIELD node
             RETURN node
             ORDER BY node.version ASC, node.title ASC;"""
        )
    )

    # The fixture creates documents that contain "Rules2024", so the search must find something.
    # TODO: pin the exact expected nodes once the full fixture contents are confirmed.
    assert len(result) > 0
    # Every record must expose the yielded `node` column.
    assert all("node" in record for record in result)
|
||||
|
||||
|
||||
def test_text_search_regex(memgraph_with_text_indexed_data):
    """Run a regex search over "complianceDocuments" and check the shape of the returned records."""
    result = list(
        memgraph_with_text_indexed_data.execute_and_fetch(
            """CALL text_search.regex_search("complianceDocuments", "Rules*") YIELD node
             RETURN node
             ORDER BY node.version ASC, node.title ASC;"""
        )
    )

    # NOTE(review): as a regular expression, `Rules*` means "Rule" followed by zero or more "s",
    # not "anything starting with Rules" — confirm whether `Rules.*` was intended. Until that is
    # settled, only the record shape is checked; the result may legitimately be empty.
    # TODO: assert the exact expected nodes once the intended pattern is confirmed.
    assert all("node" in record for record in result)
|
||||
|
||||
|
||||
def test_text_search_query_boolean(memgraph_with_text_indexed_data):
|
||||
BOOLEAN_QUERY = """CALL text_search.search("complianceDocuments", "(data.title:Rules2023 OR data.title:Rules2024) AND data.fulltext:words") YIELD node
|
||||
RETURN node.title AS title, node.version AS version
|
||||
|
Loading…
Reference in New Issue
Block a user