Add command NULLIF for identifying nulls in LOAD CSV (#914)
Add the NULLIF option to LOAD CSV, which converts every row value equal to the given character sequence into null.
This commit is contained in:
parent
63f8298033
commit
05cc35bf93
@ -3022,6 +3022,7 @@ class LoadCsv : public memgraph::query::Clause {
|
|||||||
bool ignore_bad_;
|
bool ignore_bad_;
|
||||||
memgraph::query::Expression *delimiter_{nullptr};
|
memgraph::query::Expression *delimiter_{nullptr};
|
||||||
memgraph::query::Expression *quote_{nullptr};
|
memgraph::query::Expression *quote_{nullptr};
|
||||||
|
memgraph::query::Expression *nullif_{nullptr};
|
||||||
memgraph::query::Identifier *row_var_{nullptr};
|
memgraph::query::Identifier *row_var_{nullptr};
|
||||||
|
|
||||||
LoadCsv *Clone(AstStorage *storage) const override {
|
LoadCsv *Clone(AstStorage *storage) const override {
|
||||||
@ -3031,18 +3032,20 @@ class LoadCsv : public memgraph::query::Clause {
|
|||||||
object->ignore_bad_ = ignore_bad_;
|
object->ignore_bad_ = ignore_bad_;
|
||||||
object->delimiter_ = delimiter_ ? delimiter_->Clone(storage) : nullptr;
|
object->delimiter_ = delimiter_ ? delimiter_->Clone(storage) : nullptr;
|
||||||
object->quote_ = quote_ ? quote_->Clone(storage) : nullptr;
|
object->quote_ = quote_ ? quote_->Clone(storage) : nullptr;
|
||||||
|
object->nullif_ = nullif_;
|
||||||
object->row_var_ = row_var_ ? row_var_->Clone(storage) : nullptr;
|
object->row_var_ = row_var_ ? row_var_->Clone(storage) : nullptr;
|
||||||
return object;
|
return object;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
explicit LoadCsv(Expression *file, bool with_header, bool ignore_bad, Expression *delimiter, Expression *quote,
|
explicit LoadCsv(Expression *file, bool with_header, bool ignore_bad, Expression *delimiter, Expression *quote,
|
||||||
Identifier *row_var)
|
Expression *nullif, Identifier *row_var)
|
||||||
: file_(file),
|
: file_(file),
|
||||||
with_header_(with_header),
|
with_header_(with_header),
|
||||||
ignore_bad_(ignore_bad),
|
ignore_bad_(ignore_bad),
|
||||||
delimiter_(delimiter),
|
delimiter_(delimiter),
|
||||||
quote_(quote),
|
quote_(quote),
|
||||||
|
nullif_(nullif),
|
||||||
row_var_(row_var) {
|
row_var_(row_var) {
|
||||||
DMG_ASSERT(row_var, "LoadCsv cannot take nullptr for identifier");
|
DMG_ASSERT(row_var, "LoadCsv cannot take nullptr for identifier");
|
||||||
}
|
}
|
||||||
|
@ -362,6 +362,11 @@ antlrcpp::Any CypherMainVisitor::visitLoadCsv(MemgraphCypher::LoadCsvContext *ct
|
|||||||
// handle skip bad row option
|
// handle skip bad row option
|
||||||
load_csv->ignore_bad_ = ctx->IGNORE() && ctx->BAD();
|
load_csv->ignore_bad_ = ctx->IGNORE() && ctx->BAD();
|
||||||
|
|
||||||
|
// handle character sequence which will correspond to nulls
|
||||||
|
if (ctx->NULLIF()) {
|
||||||
|
load_csv->nullif_ = std::any_cast<Expression *>(ctx->nullif()->accept(this));
|
||||||
|
}
|
||||||
|
|
||||||
// handle delimiter
|
// handle delimiter
|
||||||
if (ctx->DELIMITER()) {
|
if (ctx->DELIMITER()) {
|
||||||
if (ctx->delimiter()->literal()->StringLiteral()) {
|
if (ctx->delimiter()->literal()->StringLiteral()) {
|
||||||
|
@ -59,6 +59,7 @@ memgraphCypherKeyword : cypherKeyword
|
|||||||
| GRANT
|
| GRANT
|
||||||
| HEADER
|
| HEADER
|
||||||
| IDENTIFIED
|
| IDENTIFIED
|
||||||
|
| NULLIF
|
||||||
| ISOLATION
|
| ISOLATION
|
||||||
| IN_MEMORY_ANALYTICAL
|
| IN_MEMORY_ANALYTICAL
|
||||||
| IN_MEMORY_TRANSACTIONAL
|
| IN_MEMORY_TRANSACTIONAL
|
||||||
@ -224,6 +225,7 @@ loadCsv : LOAD CSV FROM csvFile ( WITH | NO ) HEADER
|
|||||||
( IGNORE BAD ) ?
|
( IGNORE BAD ) ?
|
||||||
( DELIMITER delimiter ) ?
|
( DELIMITER delimiter ) ?
|
||||||
( QUOTE quote ) ?
|
( QUOTE quote ) ?
|
||||||
|
( NULLIF nullif ) ?
|
||||||
AS rowVar ;
|
AS rowVar ;
|
||||||
|
|
||||||
csvFile : literal ;
|
csvFile : literal ;
|
||||||
@ -232,6 +234,8 @@ delimiter : literal ;
|
|||||||
|
|
||||||
quote : literal ;
|
quote : literal ;
|
||||||
|
|
||||||
|
nullif : literal ;
|
||||||
|
|
||||||
rowVar : variable ;
|
rowVar : variable ;
|
||||||
|
|
||||||
userOrRoleName : symbolicName ;
|
userOrRoleName : symbolicName ;
|
||||||
|
@ -85,6 +85,7 @@ MODULE_WRITE : M O D U L E UNDERSCORE W R I T E ;
|
|||||||
NEXT : N E X T ;
|
NEXT : N E X T ;
|
||||||
NO : N O ;
|
NO : N O ;
|
||||||
NOTHING : N O T H I N G ;
|
NOTHING : N O T H I N G ;
|
||||||
|
NULLIF : N U L L I F ;
|
||||||
PASSWORD : P A S S W O R D ;
|
PASSWORD : P A S S W O R D ;
|
||||||
PORT : P O R T ;
|
PORT : P O R T ;
|
||||||
PRIVILEGES : P R I V I L E G E S ;
|
PRIVILEGES : P R I V I L E G E S ;
|
||||||
|
@ -4637,13 +4637,14 @@ UniqueCursorPtr CallProcedure::MakeCursor(utils::MemoryResource *mem) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
LoadCsv::LoadCsv(std::shared_ptr<LogicalOperator> input, Expression *file, bool with_header, bool ignore_bad,
|
LoadCsv::LoadCsv(std::shared_ptr<LogicalOperator> input, Expression *file, bool with_header, bool ignore_bad,
|
||||||
Expression *delimiter, Expression *quote, Symbol row_var)
|
Expression *delimiter, Expression *quote, Expression *nullif, Symbol row_var)
|
||||||
: input_(input ? input : (std::make_shared<Once>())),
|
: input_(input ? input : (std::make_shared<Once>())),
|
||||||
file_(file),
|
file_(file),
|
||||||
with_header_(with_header),
|
with_header_(with_header),
|
||||||
ignore_bad_(ignore_bad),
|
ignore_bad_(ignore_bad),
|
||||||
delimiter_(delimiter),
|
delimiter_(delimiter),
|
||||||
quote_(quote),
|
quote_(quote),
|
||||||
|
nullif_(nullif),
|
||||||
row_var_(row_var) {
|
row_var_(row_var) {
|
||||||
MG_ASSERT(file_, "Something went wrong - '{}' member file_ shouldn't be a nullptr", __func__);
|
MG_ASSERT(file_, "Something went wrong - '{}' member file_ shouldn't be a nullptr", __func__);
|
||||||
}
|
}
|
||||||
@ -4674,22 +4675,31 @@ auto ToOptionalString(ExpressionEvaluator *evaluator, Expression *expression) ->
|
|||||||
return std::nullopt;
|
return std::nullopt;
|
||||||
};
|
};
|
||||||
|
|
||||||
TypedValue CsvRowToTypedList(csv::Reader::Row &row) {
|
TypedValue CsvRowToTypedList(csv::Reader::Row &row, std::optional<utils::pmr::string> &nullif) {
|
||||||
auto *mem = row.get_allocator().GetMemoryResource();
|
auto *mem = row.get_allocator().GetMemoryResource();
|
||||||
auto typed_columns = utils::pmr::vector<TypedValue>(mem);
|
auto typed_columns = utils::pmr::vector<TypedValue>(mem);
|
||||||
typed_columns.reserve(row.size());
|
typed_columns.reserve(row.size());
|
||||||
for (auto &column : row) {
|
for (auto &column : row) {
|
||||||
typed_columns.emplace_back(std::move(column));
|
if (!nullif.has_value() || column != nullif.value()) {
|
||||||
|
typed_columns.emplace_back(std::move(column));
|
||||||
|
} else {
|
||||||
|
typed_columns.emplace_back();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return {std::move(typed_columns), mem};
|
return {std::move(typed_columns), mem};
|
||||||
}
|
}
|
||||||
|
|
||||||
TypedValue CsvRowToTypedMap(csv::Reader::Row &row, csv::Reader::Header header) {
|
TypedValue CsvRowToTypedMap(csv::Reader::Row &row, csv::Reader::Header header,
|
||||||
|
std::optional<utils::pmr::string> &nullif) {
|
||||||
// a valid row has the same number of elements as the header
|
// a valid row has the same number of elements as the header
|
||||||
auto *mem = row.get_allocator().GetMemoryResource();
|
auto *mem = row.get_allocator().GetMemoryResource();
|
||||||
utils::pmr::map<utils::pmr::string, TypedValue> m(mem);
|
utils::pmr::map<utils::pmr::string, TypedValue> m(mem);
|
||||||
for (auto i = 0; i < row.size(); ++i) {
|
for (auto i = 0; i < row.size(); ++i) {
|
||||||
m.emplace(std::move(header[i]), std::move(row[i]));
|
if (!nullif.has_value() || row[i] != nullif.value()) {
|
||||||
|
m.emplace(std::move(header[i]), std::move(row[i]));
|
||||||
|
} else {
|
||||||
|
m.emplace(std::piecewise_construct, std::forward_as_tuple(std::move(header[i])), std::forward_as_tuple());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return {std::move(m), mem};
|
return {std::move(m), mem};
|
||||||
}
|
}
|
||||||
@ -4701,6 +4711,7 @@ class LoadCsvCursor : public Cursor {
|
|||||||
const UniqueCursorPtr input_cursor_;
|
const UniqueCursorPtr input_cursor_;
|
||||||
bool did_pull_;
|
bool did_pull_;
|
||||||
std::optional<csv::Reader> reader_{};
|
std::optional<csv::Reader> reader_{};
|
||||||
|
std::optional<utils::pmr::string> nullif_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
LoadCsvCursor(const LoadCsv *self, utils::MemoryResource *mem)
|
LoadCsvCursor(const LoadCsv *self, utils::MemoryResource *mem)
|
||||||
@ -4718,6 +4729,7 @@ class LoadCsvCursor : public Cursor {
|
|||||||
// without massacring the code even worse than I did here
|
// without massacring the code even worse than I did here
|
||||||
if (UNLIKELY(!reader_)) {
|
if (UNLIKELY(!reader_)) {
|
||||||
reader_ = MakeReader(&context.evaluation_context);
|
reader_ = MakeReader(&context.evaluation_context);
|
||||||
|
nullif_ = ParseNullif(&context.evaluation_context);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (input_cursor_->Pull(frame, context)) {
|
if (input_cursor_->Pull(frame, context)) {
|
||||||
@ -4733,10 +4745,10 @@ class LoadCsvCursor : public Cursor {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (!reader_->HasHeader()) {
|
if (!reader_->HasHeader()) {
|
||||||
frame[self_->row_var_] = CsvRowToTypedList(*row);
|
frame[self_->row_var_] = CsvRowToTypedList(*row, nullif_);
|
||||||
} else {
|
} else {
|
||||||
frame[self_->row_var_] =
|
frame[self_->row_var_] =
|
||||||
CsvRowToTypedMap(*row, csv::Reader::Header(reader_->GetHeader(), context.evaluation_context.memory));
|
CsvRowToTypedMap(*row, csv::Reader::Header(reader_->GetHeader(), context.evaluation_context.memory), nullif_);
|
||||||
}
|
}
|
||||||
if (context.frame_change_collector && context.frame_change_collector->IsKeyTracked(self_->row_var_.name())) {
|
if (context.frame_change_collector && context.frame_change_collector->IsKeyTracked(self_->row_var_.name())) {
|
||||||
context.frame_change_collector->ResetTrackingValue(self_->row_var_.name());
|
context.frame_change_collector->ResetTrackingValue(self_->row_var_.name());
|
||||||
@ -4768,6 +4780,15 @@ class LoadCsvCursor : public Cursor {
|
|||||||
csv::Reader::Config(self_->with_header_, self_->ignore_bad_, std::move(maybe_delim), std::move(maybe_quote)),
|
csv::Reader::Config(self_->with_header_, self_->ignore_bad_, std::move(maybe_delim), std::move(maybe_quote)),
|
||||||
utils::NewDeleteResource());
|
utils::NewDeleteResource());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::optional<utils::pmr::string> ParseNullif(EvaluationContext *eval_context) {
|
||||||
|
Frame frame(0);
|
||||||
|
SymbolTable symbol_table;
|
||||||
|
DbAccessor *dba = nullptr;
|
||||||
|
auto evaluator = ExpressionEvaluator(&frame, symbol_table, *eval_context, dba, storage::View::OLD);
|
||||||
|
|
||||||
|
return ToOptionalString(&evaluator, self_->nullif_);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
UniqueCursorPtr LoadCsv::MakeCursor(utils::MemoryResource *mem) const {
|
UniqueCursorPtr LoadCsv::MakeCursor(utils::MemoryResource *mem) const {
|
||||||
|
@ -2227,7 +2227,7 @@ class LoadCsv : public memgraph::query::plan::LogicalOperator {
|
|||||||
|
|
||||||
LoadCsv() = default;
|
LoadCsv() = default;
|
||||||
LoadCsv(std::shared_ptr<LogicalOperator> input, Expression *file, bool with_header, bool ignore_bad,
|
LoadCsv(std::shared_ptr<LogicalOperator> input, Expression *file, bool with_header, bool ignore_bad,
|
||||||
Expression *delimiter, Expression *quote, Symbol row_var);
|
Expression *delimiter, Expression *quote, Expression *nullif, Symbol row_var);
|
||||||
bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override;
|
bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override;
|
||||||
UniqueCursorPtr MakeCursor(utils::MemoryResource *) const override;
|
UniqueCursorPtr MakeCursor(utils::MemoryResource *) const override;
|
||||||
std::vector<Symbol> OutputSymbols(const SymbolTable &) const override;
|
std::vector<Symbol> OutputSymbols(const SymbolTable &) const override;
|
||||||
@ -2243,6 +2243,7 @@ class LoadCsv : public memgraph::query::plan::LogicalOperator {
|
|||||||
bool ignore_bad_;
|
bool ignore_bad_;
|
||||||
Expression *delimiter_{nullptr};
|
Expression *delimiter_{nullptr};
|
||||||
Expression *quote_{nullptr};
|
Expression *quote_{nullptr};
|
||||||
|
Expression *nullif_{nullptr};
|
||||||
Symbol row_var_;
|
Symbol row_var_;
|
||||||
|
|
||||||
std::unique_ptr<LogicalOperator> Clone(AstStorage *storage) const override {
|
std::unique_ptr<LogicalOperator> Clone(AstStorage *storage) const override {
|
||||||
@ -2253,6 +2254,7 @@ class LoadCsv : public memgraph::query::plan::LogicalOperator {
|
|||||||
object->ignore_bad_ = ignore_bad_;
|
object->ignore_bad_ = ignore_bad_;
|
||||||
object->delimiter_ = delimiter_ ? delimiter_->Clone(storage) : nullptr;
|
object->delimiter_ = delimiter_ ? delimiter_->Clone(storage) : nullptr;
|
||||||
object->quote_ = quote_ ? quote_->Clone(storage) : nullptr;
|
object->quote_ = quote_ ? quote_->Clone(storage) : nullptr;
|
||||||
|
object->nullif_ = nullif_;
|
||||||
object->row_var_ = row_var_;
|
object->row_var_ = row_var_;
|
||||||
return object;
|
return object;
|
||||||
}
|
}
|
||||||
|
@ -895,6 +895,10 @@ bool PlanToJsonVisitor::PreVisit(query::plan::LoadCsv &op) {
|
|||||||
self["quote"] = ToJson(op.quote_);
|
self["quote"] = ToJson(op.quote_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (op.nullif_) {
|
||||||
|
self["nullif"] = ToJson(op.nullif_);
|
||||||
|
}
|
||||||
|
|
||||||
self["row_variable"] = ToJson(op.row_var_);
|
self["row_variable"] = ToJson(op.row_var_);
|
||||||
|
|
||||||
op.input_->Accept(*this);
|
op.input_->Accept(*this);
|
||||||
|
@ -226,10 +226,9 @@ class RuleBasedPlanner {
|
|||||||
const auto &row_sym = context.symbol_table->at(*load_csv->row_var_);
|
const auto &row_sym = context.symbol_table->at(*load_csv->row_var_);
|
||||||
context.bound_symbols.insert(row_sym);
|
context.bound_symbols.insert(row_sym);
|
||||||
|
|
||||||
input_op =
|
input_op = std::make_unique<plan::LoadCsv>(std::move(input_op), load_csv->file_, load_csv->with_header_,
|
||||||
std::make_unique<plan::LoadCsv>(std::move(input_op), load_csv->file_, load_csv->with_header_,
|
load_csv->ignore_bad_, load_csv->delimiter_, load_csv->quote_,
|
||||||
load_csv->ignore_bad_, load_csv->delimiter_, load_csv->quote_, row_sym);
|
load_csv->nullif_, row_sym);
|
||||||
|
|
||||||
} else if (auto *foreach = utils::Downcast<query::Foreach>(clause)) {
|
} else if (auto *foreach = utils::Downcast<query::Foreach>(clause)) {
|
||||||
context.is_write_query = true;
|
context.is_write_query = true;
|
||||||
input_op = HandleForeachClause(foreach, std::move(input_op), *context.symbol_table, context.bound_symbols,
|
input_op = HandleForeachClause(foreach, std::move(input_op), *context.symbol_table, context.bound_symbols,
|
||||||
|
@ -8,3 +8,6 @@ endfunction()
|
|||||||
|
|
||||||
copy_load_csv_e2e_python_files(load_csv.py)
|
copy_load_csv_e2e_python_files(load_csv.py)
|
||||||
copy_load_csv_e2e_files(simple.csv)
|
copy_load_csv_e2e_files(simple.csv)
|
||||||
|
|
||||||
|
copy_load_csv_e2e_python_files(load_csv_nullif.py)
|
||||||
|
copy_load_csv_e2e_files(nullif.csv)
|
||||||
|
53
tests/e2e/load_csv/load_csv_nullif.py
Normal file
53
tests/e2e/load_csv/load_csv_nullif.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
# Copyright 2022 Memgraph Ltd.
|
||||||
|
#
|
||||||
|
# Use of this software is governed by the Business Source License
|
||||||
|
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||||
|
# License, and you may not use this file except in compliance with the Business Source License.
|
||||||
|
#
|
||||||
|
# As of the Change Date specified in that file, in accordance with
|
||||||
|
# the Business Source License, use of this software will be governed
|
||||||
|
# by the Apache License, Version 2.0, included in the file
|
||||||
|
# licenses/APL.txt.
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from gqlalchemy import Memgraph
|
||||||
|
|
||||||
|
NULLIF_CSV_FILE = "nullif.csv"
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_path(file: str) -> str:
|
||||||
|
parent_path = Path(__file__).parent.absolute()
|
||||||
|
return os.path.join(parent_path, file)
|
||||||
|
|
||||||
|
|
||||||
|
def test_given_csv_when_nullif_then_all_identical_rows_are_null():
|
||||||
|
memgraph = Memgraph("localhost", 7687)
|
||||||
|
|
||||||
|
results = list(
|
||||||
|
memgraph.execute_and_fetch(
|
||||||
|
f"""LOAD CSV FROM '{get_file_path(NULLIF_CSV_FILE)}'
|
||||||
|
WITH HEADER NULLIF 'N/A' AS row
|
||||||
|
CREATE (n:Person {{name: row.name, age: row.age,
|
||||||
|
percentage: row.percentage, works_in_IT: row.works_in_IT}})
|
||||||
|
RETURN n
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
expected_properties = [
|
||||||
|
{"age": "10", "percentage": "15.0", "works_in_IT": "false"},
|
||||||
|
{"name": "John", "percentage": "35.4", "works_in_IT": "false"},
|
||||||
|
{"name": "Milewa", "age": "34", "works_in_IT": "false"},
|
||||||
|
{"name": "Lucas", "age": "50", "percentage": "12.5"},
|
||||||
|
]
|
||||||
|
properties = [result["n"]._properties for result in results]
|
||||||
|
|
||||||
|
assert expected_properties == properties
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(pytest.main([__file__, "-rA"]))
|
5
tests/e2e/load_csv/nullif.csv
Normal file
5
tests/e2e/load_csv/nullif.csv
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
name,age,percentage,works_in_IT
|
||||||
|
N/A,10,15.0,false
|
||||||
|
John,N/A,35.4,false
|
||||||
|
Milewa,34,N/A,false
|
||||||
|
Lucas,50,12.5,N/A
|
|
@ -1,3 +1,10 @@
|
|||||||
|
nullif_cluster: &nullif_cluster
|
||||||
|
cluster:
|
||||||
|
main:
|
||||||
|
args: ["--bolt-port", "7687", "--log-level=TRACE"]
|
||||||
|
log_file: "load_csv_log_file.txt"
|
||||||
|
validation_queries: []
|
||||||
|
|
||||||
load_csv_cluster: &load_csv_cluster
|
load_csv_cluster: &load_csv_cluster
|
||||||
cluster:
|
cluster:
|
||||||
main:
|
main:
|
||||||
@ -9,6 +16,10 @@ load_csv_cluster: &load_csv_cluster
|
|||||||
validation_queries: []
|
validation_queries: []
|
||||||
|
|
||||||
workloads:
|
workloads:
|
||||||
|
- name: "LOAD CSV nullif"
|
||||||
|
binary: "tests/e2e/pytest_runner.sh"
|
||||||
|
args: ["load_csv/load_csv_nullif.py"]
|
||||||
|
<<: *nullif_cluster
|
||||||
- name: "MATCH + LOAD CSV"
|
- name: "MATCH + LOAD CSV"
|
||||||
binary: "tests/e2e/pytest_runner.sh"
|
binary: "tests/e2e/pytest_runner.sh"
|
||||||
args: ["load_csv/load_csv.py"]
|
args: ["load_csv/load_csv.py"]
|
||||||
|
Loading…
Reference in New Issue
Block a user