2023-01-16 01:25:48 +08:00
|
|
|
// Copyright 2023 Memgraph Ltd.
|
2021-10-26 14:53:56 +08:00
|
|
|
//
|
|
|
|
// Use of this software is governed by the Business Source License
|
|
|
|
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
|
|
|
// License, and you may not use this file except in compliance with the Business Source License.
|
|
|
|
//
|
|
|
|
// As of the Change Date specified in that file, in accordance with
|
|
|
|
// the Business Source License, use of this software will be governed
|
|
|
|
// by the Apache License, Version 2.0, included in the file
|
|
|
|
// licenses/APL.txt.
|
|
|
|
|
2021-03-19 00:24:25 +08:00
|
|
|
#include "gmock/gmock.h"
|
|
|
|
#include "gtest/gtest.h"
|
|
|
|
#include "utils/csv_parsing.hpp"
|
|
|
|
|
|
|
|
#include "utils/string.hpp"
|
|
|
|
|
2021-05-18 19:44:29 +08:00
|
|
|
/// Fixture shared by all CSV reader tests.
///
/// The `const char *` test parameter is handed to `FileWriter` by the tests as the
/// line terminator. Before every test a scratch directory under the system temp
/// directory is wiped and recreated; after every test it is removed again.
class CsvReaderTest : public ::testing::TestWithParam<const char *> {
 protected:
  /// Scratch directory in which the tests create their CSV files.
  const std::filesystem::path csv_directory{std::filesystem::temp_directory_path() / "csv_testing"};

  void SetUp() override {
    // Start from a clean slate even if an earlier run left files behind.
    Clear();
    CreateCsvDir();
  }

  void TearDown() override { Clear(); }

 private:
  /// Creates the scratch directory unless it is already present.
  void CreateCsvDir() {
    if (std::filesystem::exists(csv_directory)) return;
    std::filesystem::create_directory(csv_directory);
  }

  /// Removes the scratch directory together with everything inside it, if it exists.
  void Clear() {
    if (std::filesystem::exists(csv_directory)) {
      std::filesystem::remove_all(csv_directory);
    }
  }
};
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
/// Small helper that writes text lines to a file, terminating each line with a
/// configurable newline sequence (e.g. "\n" or "\r\n").
///
/// The writer is neither copyable nor movable.
class FileWriter {
 public:
  /// Opens `path` for writing.
  ///
  /// @param path    file to write to.
  /// @param newline sequence appended after every line written by `WriteLine`.
  explicit FileWriter(const std::filesystem::path &path, std::string newline = "\n")
      : stream_{path}, newline_{std::move(newline)} {}

  FileWriter(const FileWriter &) = delete;
  FileWriter &operator=(const FileWriter &) = delete;

  FileWriter(FileWriter &&) = delete;
  FileWriter &operator=(FileWriter &&) = delete;

  /// Closes the underlying stream; subsequent `WriteLine` calls write nothing.
  void Close() { stream_.close(); }

  /// Writes `line` followed by the configured newline sequence.
  ///
  /// @return the number of characters handed to the stream (line plus terminator),
  ///         or 0 if the stream is not open.
  size_t WriteLine(const std::string_view line) {
    if (!stream_.is_open()) {
      return 0;
    }

    stream_ << line << newline_;

    // Count the whole terminator: "\r\n" contributes two characters, not one.
    return line.size() + newline_.size();
  }

 private:
  std::ofstream stream_;
  std::string newline_;
};
|
|
|
|
|
|
|
|
std::string CreateRow(const std::vector<std::string> &columns, const std::string_view delim) {
|
2022-02-22 20:33:45 +08:00
|
|
|
return memgraph::utils::Join(columns, delim);
|
2021-03-19 00:24:25 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
auto ToPmrColumns(const std::vector<std::string> &columns) {
|
2022-02-22 20:33:45 +08:00
|
|
|
memgraph::utils::pmr::vector<memgraph::utils::pmr::string> pmr_columns(memgraph::utils::NewDeleteResource());
|
2021-03-19 00:24:25 +08:00
|
|
|
for (const auto &col : columns) {
|
|
|
|
pmr_columns.emplace_back(col);
|
|
|
|
}
|
|
|
|
return pmr_columns;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
2021-05-18 19:44:29 +08:00
|
|
|
// A single row written with ',' as the separator must be returned column by column.
TEST_P(CsvReaderTest, CommaDelimiter) {
  // create a file with a single valid row;
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath, GetParam());

  const std::vector<std::string> columns{"A", "B", "C"};
  writer.WriteLine(CreateRow(columns, ","));

  writer.Close();

  memgraph::utils::MemoryResource *mem{memgraph::utils::NewDeleteResource()};

  const bool with_header = false;
  const bool ignore_bad = false;
  const memgraph::utils::pmr::string delimiter{",", mem};
  const memgraph::utils::pmr::string quote{"\"", mem};

  const memgraph::csv::Reader::Config cfg{with_header, ignore_bad, delimiter, quote};
  auto reader = memgraph::csv::Reader(filepath, cfg, mem);

  const auto parsed_row = reader.GetNextRow(mem);
  // Fail cleanly if no row was produced instead of dereferencing an empty optional.
  ASSERT_TRUE(parsed_row.has_value());
  ASSERT_EQ(*parsed_row, ToPmrColumns(columns));
}
|
|
|
|
|
2021-05-18 19:44:29 +08:00
|
|
|
// A single row written with ';' as the separator must be returned column by column
// when the reader is configured with the same delimiter.
TEST_P(CsvReaderTest, SemicolonDelimiter) {
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath, GetParam());

  memgraph::utils::MemoryResource *mem(memgraph::utils::NewDeleteResource());

  const memgraph::utils::pmr::string delimiter{";", mem};
  const memgraph::utils::pmr::string quote{"\"", mem};

  const std::vector<std::string> columns{"A", "B", "C"};
  writer.WriteLine(CreateRow(columns, delimiter));

  writer.Close();

  const bool with_header = false;
  const bool ignore_bad = false;
  const memgraph::csv::Reader::Config cfg{with_header, ignore_bad, delimiter, quote};
  auto reader = memgraph::csv::Reader(filepath, cfg, mem);

  const auto parsed_row = reader.GetNextRow(mem);
  // Fail cleanly if no row was produced instead of dereferencing an empty optional.
  ASSERT_TRUE(parsed_row.has_value());
  ASSERT_EQ(*parsed_row, ToPmrColumns(columns));
}
|
|
|
|
|
2021-05-18 19:44:29 +08:00
|
|
|
// Exercises the 'ignore_bad' flag on a file whose first two rows are invalid.
TEST_P(CsvReaderTest, SkipBad) {
  // create a file with invalid first two rows (containing a string with a
  // missing closing quote);
  // the last row is valid;
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath, GetParam());

  memgraph::utils::MemoryResource *mem(memgraph::utils::NewDeleteResource());

  const memgraph::utils::pmr::string delimiter{";", mem};
  const memgraph::utils::pmr::string quote{"\"", mem};

  const std::vector<std::string> columns_bad{"A", "B", "\"\"C"};
  writer.WriteLine(CreateRow(columns_bad, delimiter));
  writer.WriteLine(CreateRow(columns_bad, delimiter));

  const std::vector<std::string> columns_good{"A", "B", "C"};
  writer.WriteLine(CreateRow(columns_good, delimiter));

  writer.Close();

  {
    // we set the 'ignore_bad' flag in the read configuration to 'true';
    // parser's output should be solely the valid row;
    const bool with_header = false;
    const bool ignore_bad = true;
    const memgraph::csv::Reader::Config cfg{with_header, ignore_bad, delimiter, quote};
    auto reader = memgraph::csv::Reader(filepath, cfg, mem);

    const auto parsed_row = reader.GetNextRow(mem);
    // Fail cleanly if no row was produced instead of dereferencing an empty optional.
    ASSERT_TRUE(parsed_row.has_value());
    ASSERT_EQ(*parsed_row, ToPmrColumns(columns_good));
  }

  {
    // we set the 'ignore_bad' flag in the read configuration to 'false';
    // an exception must be thrown;
    const bool with_header = false;
    const bool ignore_bad = false;
    const memgraph::csv::Reader::Config cfg{with_header, ignore_bad, delimiter, quote};
    auto reader = memgraph::csv::Reader(filepath, cfg, mem);

    EXPECT_THROW(reader.GetNextRow(mem), memgraph::csv::CsvReadException);
  }
}
|
|
|
|
|
2021-05-18 19:44:29 +08:00
|
|
|
// Every row of a fully valid file must be returned, after which the reader signals
// the end of input.
TEST_P(CsvReaderTest, AllRowsValid) {
  // create a file with all rows valid;
  // parser should return every row and then 'std::nullopt'
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath, GetParam());

  memgraph::utils::MemoryResource *mem(memgraph::utils::NewDeleteResource());

  const memgraph::utils::pmr::string delimiter{",", mem};
  const memgraph::utils::pmr::string quote{"\"", mem};

  const std::vector<std::string> columns{"A", "B", "C"};
  writer.WriteLine(CreateRow(columns, delimiter));
  writer.WriteLine(CreateRow(columns, delimiter));
  writer.WriteLine(CreateRow(columns, delimiter));

  writer.Close();

  const bool with_header = false;
  const bool ignore_bad = false;
  const memgraph::csv::Reader::Config cfg{with_header, ignore_bad, delimiter, quote};
  auto reader = memgraph::csv::Reader(filepath, cfg);

  const auto pmr_columns = ToPmrColumns(columns);
  int parsed_rows = 0;
  while (auto parsed_row = reader.GetNextRow(mem)) {
    ASSERT_EQ(*parsed_row, pmr_columns);
    ++parsed_rows;
  }
  // Without this check the loop would pass vacuously if the reader returned no rows at all.
  ASSERT_EQ(parsed_rows, 3);
}
|
|
|
|
|
2021-05-18 19:44:29 +08:00
|
|
|
// With 'ignore_bad' enabled and only invalid rows in the file, the very first read
// is expected to yield 'std::nullopt'.
TEST_P(CsvReaderTest, SkipAllRows) {
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath, GetParam());

  memgraph::utils::MemoryResource *mem(memgraph::utils::NewDeleteResource());
  const memgraph::utils::pmr::string delimiter{",", mem};
  const memgraph::utils::pmr::string quote{"\"", mem};

  // three identical invalid rows (the last column has an unbalanced quote)
  const std::vector<std::string> columns_bad{"A", "B", "\"\"C"};
  for (int row_index = 0; row_index < 3; ++row_index) {
    writer.WriteLine(CreateRow(columns_bad, delimiter));
  }
  writer.Close();

  const bool with_header = false;
  const bool ignore_bad = true;
  const memgraph::csv::Reader::Config cfg{with_header, ignore_bad, delimiter, quote};
  auto reader = memgraph::csv::Reader(filepath, cfg);

  auto first_result = reader.GetNextRow(mem);
  ASSERT_EQ(first_result, std::nullopt);
}
|
|
|
|
|
2021-05-18 19:44:29 +08:00
|
|
|
// With 'with_header' enabled the first line is exposed through GetHeader() and the
// remaining lines are returned as data rows.
TEST_P(CsvReaderTest, WithHeader) {
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath, GetParam());

  memgraph::utils::MemoryResource *mem(memgraph::utils::NewDeleteResource());

  const memgraph::utils::pmr::string delimiter{",", mem};
  const memgraph::utils::pmr::string quote{"\"", mem};

  const std::vector<std::string> header{"A", "B", "C"};
  const std::vector<std::string> columns{"1", "2", "3"};
  writer.WriteLine(CreateRow(header, delimiter));
  writer.WriteLine(CreateRow(columns, delimiter));
  writer.WriteLine(CreateRow(columns, delimiter));
  writer.WriteLine(CreateRow(columns, delimiter));

  writer.Close();

  const bool with_header = true;
  const bool ignore_bad = false;
  const memgraph::csv::Reader::Config cfg(with_header, ignore_bad, delimiter, quote);
  auto reader = memgraph::csv::Reader(filepath, cfg);

  const auto pmr_header = ToPmrColumns(header);
  ASSERT_EQ(reader.GetHeader(), pmr_header);

  const auto pmr_columns = ToPmrColumns(columns);
  int parsed_rows = 0;
  while (auto parsed_row = reader.GetNextRow(mem)) {
    ASSERT_EQ(*parsed_row, pmr_columns);
    ++parsed_rows;
  }
  // Without this check the loop would pass vacuously if no data rows were returned.
  ASSERT_EQ(parsed_rows, 3);
}
|
2021-03-24 18:02:55 +08:00
|
|
|
|
2021-05-18 19:44:29 +08:00
|
|
|
// A quoted field may span a line break; the two physical lines must be merged into
// one logical row.
TEST_P(CsvReaderTest, MultilineQuotedString) {
  // create a file with first row valid and the second row containing a quoted
  // string spanning two lines;
  // parser should return two valid rows
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath, GetParam());

  memgraph::utils::MemoryResource *mem(memgraph::utils::NewDeleteResource());

  const memgraph::utils::pmr::string delimiter{",", mem};
  const memgraph::utils::pmr::string quote{"\"", mem};

  const std::vector<std::string> first_row{"A", "B", "C"};
  const std::vector<std::string> multiline_first{"D", "\"E", "\"\"F"};
  const std::vector<std::string> multiline_second{"G\"", "H"};

  writer.WriteLine(CreateRow(first_row, delimiter));
  writer.WriteLine(CreateRow(multiline_first, delimiter));
  writer.WriteLine(CreateRow(multiline_second, delimiter));

  writer.Close();

  const bool with_header = false;
  const bool ignore_bad = true;
  const memgraph::csv::Reader::Config cfg{with_header, ignore_bad, delimiter, quote};
  auto reader = memgraph::csv::Reader(filepath, cfg);

  auto parsed_row = reader.GetNextRow(mem);
  // Fail cleanly if no row was produced instead of dereferencing an empty optional.
  ASSERT_TRUE(parsed_row.has_value());
  ASSERT_EQ(*parsed_row, ToPmrColumns(first_row));

  const std::vector<std::string> expected_multiline{"D", "E,\"FG", "H"};
  parsed_row = reader.GetNextRow(mem);
  ASSERT_TRUE(parsed_row.has_value());
  ASSERT_EQ(*parsed_row, ToPmrColumns(expected_multiline));
}
|
2021-03-27 16:47:41 +08:00
|
|
|
|
2021-05-18 19:44:29 +08:00
|
|
|
// Rows with an empty first, middle or last column must come back with an empty
// string in that position.
TEST_P(CsvReaderTest, EmptyColumns) {
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath, GetParam());

  memgraph::utils::MemoryResource *mem(memgraph::utils::NewDeleteResource());
  const memgraph::utils::pmr::string delimiter{",", mem};
  const memgraph::utils::pmr::string quote{"\"", mem};

  // one row per position of the empty column: first, middle, last
  std::vector<std::vector<std::string>> expected_rows{{"", "B", "C"}, {"A", "", "C"}, {"A", "B", ""}};
  for (const auto &columns : expected_rows) {
    writer.WriteLine(CreateRow(columns, delimiter));
  }
  writer.Close();

  const bool with_header = false;
  const bool ignore_bad = false;
  const memgraph::csv::Reader::Config cfg{with_header, ignore_bad, delimiter, quote};
  auto reader = memgraph::csv::Reader(filepath, cfg);

  // rows are expected back in the order in which they were written
  for (const auto &columns : expected_rows) {
    const auto pmr_expected_row = ToPmrColumns(columns);
    const auto parsed_row = reader.GetNextRow(mem);
    ASSERT_TRUE(parsed_row.has_value());
    ASSERT_EQ(*parsed_row, pmr_expected_row);
  }
}
|
2021-05-18 19:44:29 +08:00
|
|
|
|
2023-01-16 01:25:48 +08:00
|
|
|
// Instantiates every CsvReaderTest case once per line terminator: "\n" and "\r\n".
// The tests pass the parameter (GetParam()) to FileWriter as its newline sequence.
INSTANTIATE_TEST_SUITE_P(NewlineParameterizedTest, CsvReaderTest, ::testing::Values("\n", "\r\n"));
|