parent
dc5eb4befd
commit
f6d5f576d5
227
src/utils/csv_parsing.cpp
Normal file
227
src/utils/csv_parsing.cpp
Normal file
@ -0,0 +1,227 @@
|
||||
#include "utils/csv_parsing.hpp"
|
||||
|
||||
#include <string_view>
|
||||
|
||||
#include "utils/file.hpp"
|
||||
#include "utils/string.hpp"
|
||||
|
||||
namespace csv {
|
||||
|
||||
using ParseError = Reader::ParseError;
|
||||
|
||||
void Reader::InitializeStream() {
|
||||
if (!std::filesystem::exists(path_)) {
|
||||
throw CsvReadException("CSV file not found: {}", path_.string());
|
||||
}
|
||||
csv_stream_.open(path_);
|
||||
if (!csv_stream_.good()) {
|
||||
throw CsvReadException("CSV file {} couldn't be opened!", path_.string());
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<std::string> Reader::GetNextLine() {
|
||||
std::string line;
|
||||
if (!std::getline(csv_stream_, line)) {
|
||||
// reached end of file or an I/0 error occurred
|
||||
if (!csv_stream_.good()) {
|
||||
csv_stream_.close();
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
++line_count_;
|
||||
return line;
|
||||
}
|
||||
|
||||
std::optional<Reader::Header> Reader::ParseHeader() {
|
||||
// header must be the very first line in the file
|
||||
MG_ASSERT(line_count_ == 1, fmt::format("Invalid use of {}", __func__));
|
||||
const auto maybe_line = GetNextLine();
|
||||
if (!maybe_line) {
|
||||
throw CsvReadException("CSV file {} empty!", path_);
|
||||
}
|
||||
Header header;
|
||||
// set the 'number_of_fields_' once this method is implemented fully
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
namespace {
|
||||
enum class CsvParserState : uint8_t {
|
||||
INITIAL_FIELD,
|
||||
NEXT_FIELD,
|
||||
QUOTING,
|
||||
NOT_QUOTING,
|
||||
EXPECT_DELIMITER,
|
||||
};
|
||||
|
||||
bool SubstringStartsWith(const std::string_view str, size_t pos, const std::string_view what) {
|
||||
return utils::StartsWith(utils::Substr(str, pos), what);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
// Parses the next row of the CSV file.
//
// A row normally occupies one line, but reading continues onto the following
// lines for as long as a quoted field is still open at the end of a line.
//
// Returns:
//  - a Row with the parsed columns on success;
//  - an empty Row when no more data could be read;
//  - a ParseError when the row is malformed (NULL byte, unexpected token after
//    a closing quote, unterminated quote, wrong number of columns).
//
// When the file has no header, the first successfully parsed row fixes the
// number of columns every later row must have.
Reader::ParsingResult Reader::ParseRow() {
  std::vector<std::string> row;
  std::string column;

  const auto &quote = read_config_.quote;
  const auto &delimiter = read_config_.delimiter;

  auto state = CsvParserState::INITIAL_FIELD;

  do {
    const auto maybe_line = GetNextLine();
    if (!maybe_line) {
      // The whole file was processed.
      break;
    }
    const auto &line = *maybe_line;

    for (size_t i = 0; i < line.size(); ++i) {
      const auto c = line[i];

      // Line feeds and carriage returns are ignored in CSVs.
      if (c == '\n' || c == '\r') continue;

      // Null bytes aren't allowed in CSVs.
      if (c == '\0') {
        // GetNextLine() has already advanced 'line_count_' past the line that
        // is being parsed, hence the '- 1'.
        return ParseError(ParseError::ErrorCode::NULL_BYTE,
                          fmt::format("CSV: Line {:d} contains NULL byte", line_count_ - 1));
      }

      switch (state) {
        case CsvParserState::INITIAL_FIELD:
        case CsvParserState::NEXT_FIELD: {
          if (SubstringStartsWith(line, i, quote)) {
            // The current field is a quoted field.
            state = CsvParserState::QUOTING;
            // Skip the rest of a multi-character quote; the loop's ++i
            // accounts for the first character.
            i += quote.size() - 1;
          } else if (SubstringStartsWith(line, i, delimiter)) {
            // The current field has an empty value.
            row.emplace_back("");
            state = CsvParserState::NEXT_FIELD;
            i += delimiter.size() - 1;
          } else {
            // The current field is a regular field.
            column.push_back(c);
            state = CsvParserState::NOT_QUOTING;
          }
          break;
        }
        case CsvParserState::QUOTING: {
          const auto quote_now = SubstringStartsWith(line, i, quote);
          const auto quote_next = SubstringStartsWith(line, i + quote.size(), quote);
          if (quote_now && quote_next) {
            // This is an escaped quote character.
            column += quote;
            i += quote.size() * 2 - 1;
          } else if (quote_now) {
            // This is the end of the quoted field.
            row.emplace_back(std::move(column));
            // A moved-from string is only guaranteed to be valid, not empty.
            column.clear();
            state = CsvParserState::EXPECT_DELIMITER;
            i += quote.size() - 1;
          } else {
            column.push_back(c);
          }
          break;
        }
        case CsvParserState::NOT_QUOTING: {
          if (SubstringStartsWith(line, i, delimiter)) {
            row.emplace_back(std::move(column));
            column.clear();
            state = CsvParserState::NEXT_FIELD;
            i += delimiter.size() - 1;
          } else {
            column.push_back(c);
          }
          break;
        }
        case CsvParserState::EXPECT_DELIMITER: {
          if (SubstringStartsWith(line, i, delimiter)) {
            state = CsvParserState::NEXT_FIELD;
            i += delimiter.size() - 1;
          } else {
            return ParseError(ParseError::ErrorCode::UNEXPECTED_TOKEN,
                              fmt::format("CSV Reader: Expected '{}' after '{}', but got '{}'", delimiter, quote, c));
          }
          break;
        }
      }
    }
    // Keep reading lines only while a quoted field is still open.
  } while (state == CsvParserState::QUOTING);

  // Flush whatever the state machine was in the middle of when input ended.
  switch (state) {
    case CsvParserState::INITIAL_FIELD: {
      break;
    }
    case CsvParserState::NEXT_FIELD: {
      // The line ended right after a delimiter: the last field is empty.
      row.emplace_back(std::move(column));
      break;
    }
    case CsvParserState::QUOTING: {
      return ParseError(ParseError::ErrorCode::NO_CLOSING_QUOTE,
                        "There is no more data left to load while inside a quoted string. "
                        "Did you forget to close the quote?");
    }
    case CsvParserState::NOT_QUOTING: {
      row.emplace_back(std::move(column));
      break;
    }
    case CsvParserState::EXPECT_DELIMITER: {
      break;
    }
  }

  // reached the end of file - return empty row
  if (row.empty()) {
    return Row(std::move(row));
  }

  // if there's no header, then:
  //    - if we skip bad rows, then the very first __valid__ row will
  //      determine the allowed number of columns
  //    - if we don't skip bad rows, the very first row will determine the allowed
  //      number of columns in all subsequent rows
  if (!read_config_.with_header && number_of_columns_ == 0) {
    MG_ASSERT(!row.empty());
    number_of_columns_ = row.size();
  }

  if (row.size() != number_of_columns_) {
    return ParseError(
        ParseError::ErrorCode::BAD_NUM_OF_COLUMNS,
        // ToDo(the-joksim):
        //    - 'line_count_ - 1' is the last line of a row (as a
        //      row may span several lines) ==> should have a row
        //      counter
        fmt::format("Expected {:d} columns in row {:d}, but got {:d}", number_of_columns_, line_count_ - 1,
                    row.size()));
  }

  return Row(std::move(row));
}
|
||||
|
||||
// Returns Reader::Row if the read row if valid;
|
||||
// Returns std::nullopt if end of file is reached or an error occurred
|
||||
// making it unreadable;
|
||||
// @throws CsvReadException if a bad row is encountered, and the skip_bad is set
|
||||
// to 'true' in the Reader::Config.
|
||||
std::optional<Reader::Row> Reader::GetNextRow() {
|
||||
auto row = ParseRow();
|
||||
|
||||
if (row.HasError()) {
|
||||
if (!read_config_.skip_bad) {
|
||||
throw CsvReadException("CSV Reader: Bad row at line {:d}: {}", line_count_, row.GetError().message);
|
||||
}
|
||||
// try to parse as many times as necessary to reach a valid row
|
||||
do {
|
||||
spdlog::debug("CSV Reader: Bad row at line {:d}: {}", line_count_, row.GetError().message);
|
||||
if (!csv_stream_.good()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
row = ParseRow();
|
||||
} while (row.HasError());
|
||||
}
|
||||
|
||||
auto ret = row.GetValue();
|
||||
if (ret.columns.empty()) {
|
||||
// reached end of file
|
||||
return std::nullopt;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // namespace csv
|
97
src/utils/csv_parsing.hpp
Normal file
97
src/utils/csv_parsing.hpp
Normal file
@ -0,0 +1,97 @@
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
* This file contains utilities for parsing CSV files.
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <optional>
|
||||
#include <filesystem>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "utils/exceptions.hpp"
|
||||
#include "utils/result.hpp"
|
||||
|
||||
namespace csv {
|
||||
|
||||
class CsvReadException : public utils::BasicException {
|
||||
using utils::BasicException::BasicException;
|
||||
};
|
||||
|
||||
class Reader {
|
||||
public:
|
||||
struct Config {
|
||||
Config(){};
|
||||
Config(std::string delimiter, std::string quote, const bool with_header, const bool skip_bad)
|
||||
: delimiter(std::move(delimiter)), quote(std::move(quote)), with_header(with_header), skip_bad(skip_bad) {}
|
||||
|
||||
std::string delimiter{","};
|
||||
std::string quote{"\""};
|
||||
bool with_header{false};
|
||||
bool skip_bad{false};
|
||||
};
|
||||
|
||||
struct Row {
|
||||
Row() = default;
|
||||
explicit Row(std::vector<std::string> cols) : columns(std::move(cols)) {}
|
||||
std::vector<std::string> columns;
|
||||
};
|
||||
|
||||
explicit Reader(const std::filesystem::path &path, const Config cfg = {}) : path_(path), read_config_(cfg) {
|
||||
InitializeStream();
|
||||
if (read_config_.with_header) {
|
||||
header_ = ParseHeader();
|
||||
}
|
||||
}
|
||||
|
||||
Reader(const Reader &) = delete;
|
||||
Reader &operator=(const Reader &) = delete;
|
||||
|
||||
Reader(Reader &&) = delete;
|
||||
Reader &operator=(Reader &&) = delete;
|
||||
|
||||
~Reader() {
|
||||
if (csv_stream_.is_open()) csv_stream_.close();
|
||||
}
|
||||
|
||||
struct ParseError {
|
||||
enum class ErrorCode : uint8_t { BAD_HEADER, NO_CLOSING_QUOTE, UNEXPECTED_TOKEN, BAD_NUM_OF_COLUMNS, NULL_BYTE };
|
||||
ParseError(ErrorCode code, std::string message) : code(code), message(std::move(message)) {}
|
||||
|
||||
ErrorCode code;
|
||||
std::string message;
|
||||
};
|
||||
|
||||
using ParsingResult = utils::BasicResult<ParseError, Row>;
|
||||
std::optional<Row> GetNextRow();
|
||||
|
||||
private:
|
||||
std::filesystem::path path_;
|
||||
std::ifstream csv_stream_;
|
||||
Config read_config_;
|
||||
uint64_t line_count_{1};
|
||||
uint16_t number_of_columns_{0};
|
||||
|
||||
struct Header {
|
||||
Header() = default;
|
||||
explicit Header(std::vector<std::string> cols) : columns(std::move(cols)) {}
|
||||
std::vector<std::string> columns;
|
||||
};
|
||||
|
||||
std::optional<Header> header_{};
|
||||
|
||||
void InitializeStream();
|
||||
|
||||
std::optional<std::string> GetNextLine();
|
||||
|
||||
std::optional<Header> ParseHeader();
|
||||
|
||||
ParsingResult ParseRow();
|
||||
};
|
||||
|
||||
} // namespace csv
|
@ -57,7 +57,7 @@ bool RenamePath(const std::filesystem::path &src, const std::filesystem::path &d
|
||||
/// `write` for each of our (very small) logical reads/writes. Because of that,
|
||||
/// `read` or `write` is only called when the buffer is full and/or needs
|
||||
/// emptying.
|
||||
const size_t kFileBufferSize = 262144;
|
||||
constexpr size_t kFileBufferSize = 262144;
|
||||
|
||||
/// This class implements a file handler that is used to read binary files. It
|
||||
/// was developed because the C++ standard library has an awful API and makes
|
||||
|
@ -226,6 +226,9 @@ target_link_libraries(${test_prefix}utils_file_locker mg-utils fmt)
|
||||
add_unit_test(utils_thread_pool.cpp)
|
||||
target_link_libraries(${test_prefix}utils_thread_pool mg-utils fmt)
|
||||
|
||||
add_unit_test(csv_parsing.cpp ${CMAKE_SOURCE_DIR}/src/utils/csv_parsing.cpp)
|
||||
target_link_libraries(${test_prefix}csv_parsing mg-utils fmt)
|
||||
|
||||
# Test mg-storage-v2
|
||||
|
||||
add_unit_test(commit_log_v2.cpp)
|
||||
|
194
tests/unit/csv_parsing.cpp
Normal file
194
tests/unit/csv_parsing.cpp
Normal file
@ -0,0 +1,194 @@
|
||||
#include "utils/csv_parsing.hpp"
|
||||
#include "gmock/gmock.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
#include "utils/string.hpp"
|
||||
|
||||
|
||||
class CsvReaderTest : public ::testing::Test {
|
||||
protected:
|
||||
const std::filesystem::path csv_directory{std::filesystem::temp_directory_path() / "csv_testing"};
|
||||
|
||||
void SetUp() override { Clear(); CreateCsvDir(); }
|
||||
|
||||
void TearDown() override { Clear(); }
|
||||
|
||||
private:
|
||||
|
||||
void CreateCsvDir() {
|
||||
if (!std::filesystem::exists(csv_directory)) {
|
||||
std::filesystem::create_directory(csv_directory);
|
||||
}
|
||||
}
|
||||
void Clear() {
|
||||
if (!std::filesystem::exists(csv_directory)) return;
|
||||
std::filesystem::remove_all(csv_directory);
|
||||
}
|
||||
};
|
||||
|
||||
namespace {
|
||||
// Minimal helper for writing text files line by line.
class FileWriter {
 public:
  // Opens (creating or truncating) the file at `path` for writing.
  explicit FileWriter(const std::filesystem::path &path) { stream_.open(path); }

  FileWriter(const FileWriter &) = delete;
  FileWriter &operator=(const FileWriter &) = delete;

  FileWriter(FileWriter &&) = delete;
  FileWriter &operator=(FileWriter &&) = delete;

  // Closes the file; buffered data is written out at this point.
  void Close() { stream_.close(); }

  // Writes `line` followed by a newline character.
  // Returns the number of characters written, or 0 if the file is not open.
  size_t WriteLine(const std::string_view line) {
    if (!stream_.is_open()) {
      return 0;
    }

    // '\n' instead of std::endl: no need to flush after every single line.
    stream_ << line << '\n';

    // including the newline character
    return line.size() + 1;
  }

 private:
  std::ofstream stream_;
};
|
||||
|
||||
std::string CreateRow(const std::vector<std::string> &columns, const std::string_view delim) {
|
||||
return utils::Join(columns, delim);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST_F(CsvReaderTest, CommaDelimiter) {
  // create a file with a valid and an invalid row;
  // the invalid row has wrong delimiters, so with the default delimiter it
  // parses as a single column instead of three;
  // expect the reader to throw on the invalid row, as bad rows aren't skipped
  // by default;
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath);

  const std::vector<std::string> columns1{"A", "B", "C"};
  writer.WriteLine(CreateRow(columns1, ","));

  const std::vector<std::string> columns2{"D", "E", "F"};
  writer.WriteLine(CreateRow(columns2, ";"));

  writer.Close();

  // note - default delimiter is ","
  auto reader = csv::Reader(filepath);

  auto parsed_row = reader.GetNextRow();
  // guard the dereference below
  ASSERT_TRUE(parsed_row.has_value());
  ASSERT_EQ(parsed_row->columns, columns1);

  EXPECT_THROW(reader.GetNextRow(), csv::CsvReadException);
}
|
||||
|
||||
TEST_F(CsvReaderTest, SemicolonDelimiter) {
  // create a file whose first row uses the configured ';' delimiter and whose
  // second row uses ',' instead;
  // the first row must be returned as-is, the second one must make the reader
  // throw, as bad rows aren't skipped;
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath);

  const std::string delimiter = ";";
  const std::vector<std::string> columns1{"A", "B", "C"};
  writer.WriteLine(CreateRow(columns1, delimiter));

  const std::vector<std::string> columns2{"A", "B", "C"};
  writer.WriteLine(CreateRow(columns2, ","));

  writer.Close();

  const csv::Reader::Config cfg(delimiter, "\"", false, false);
  auto reader = csv::Reader(filepath, cfg);

  auto parsed_row = reader.GetNextRow();
  // guard the dereference below
  ASSERT_TRUE(parsed_row.has_value());
  ASSERT_EQ(parsed_row->columns, columns1);

  EXPECT_THROW(reader.GetNextRow(), csv::CsvReadException);
}
|
||||
|
||||
TEST_F(CsvReaderTest, SkipBad) {
  // create a file with invalid first two rows (containing a string with a
  // missing closing quote);
  // the last row is valid;
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath);

  const std::string delimiter = ",";

  const std::vector<std::string> columns_bad{"A", "B", "\"C"};
  writer.WriteLine(CreateRow(columns_bad, delimiter));
  writer.WriteLine(CreateRow(columns_bad, delimiter));

  const std::vector<std::string> columns_good{"A", "B", "C"};
  writer.WriteLine(CreateRow(columns_good, delimiter));

  writer.Close();

  {
    // we set the 'skip_bad' flag in the read configuration to 'true';
    // parser's output should be solely the valid row;
    const bool skip_bad = true;
    const csv::Reader::Config cfg(delimiter, "\"", false, skip_bad);
    auto reader = csv::Reader(filepath, cfg);

    auto parsed_row = reader.GetNextRow();
    // guard the dereference below
    ASSERT_TRUE(parsed_row.has_value());
    ASSERT_EQ(parsed_row->columns, columns_good);
  }

  {
    // we set the 'skip_bad' flag in the read configuration to 'false';
    // an exception must be thrown;
    const bool skip_bad = false;
    const csv::Reader::Config cfg(delimiter, "\"", false, skip_bad);
    auto reader = csv::Reader(filepath, cfg);

    EXPECT_THROW(reader.GetNextRow(), csv::CsvReadException);
  }
}
|
||||
|
||||
TEST_F(CsvReaderTest, AllRowsValid) {
  // create a file with all rows valid;
  // parser should return every row unchanged, followed by 'std::nullopt'
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath);

  const std::string delimiter = ",";

  const std::vector<std::string> columns{"A", "B", "C"};
  writer.WriteLine(CreateRow(columns, delimiter));
  writer.WriteLine(CreateRow(columns, delimiter));
  writer.WriteLine(CreateRow(columns, delimiter));

  writer.Close();

  const bool skip_bad = false;
  const csv::Reader::Config cfg(delimiter, "\"", false, skip_bad);
  auto reader = csv::Reader(filepath, cfg);

  // count the rows so the test cannot pass with a reader that returns nothing
  size_t rows_read = 0;
  while (auto parsed_row = reader.GetNextRow()) {
    ASSERT_EQ(parsed_row->columns, columns);
    ++rows_read;
  }
  ASSERT_EQ(rows_read, 3U);
}
|
||||
|
||||
TEST_F(CsvReaderTest, SkipAllRows) {
  // every row in the file is invalid (it contains a string with a missing
  // closing quote), so with 'skip_bad' enabled the reader is expected to
  // yield 'std::nullopt'
  const auto filepath = csv_directory / "bla.csv";
  auto writer = FileWriter(filepath);

  const std::string delimiter = ",";
  const std::vector<std::string> columns_bad{"A", "B", "\"C"};

  for (int written = 0; written < 3; ++written) {
    writer.WriteLine(CreateRow(columns_bad, delimiter));
  }
  writer.Close();

  const bool skip_bad = true;
  const csv::Reader::Config cfg(delimiter, "\"", false, skip_bad);
  auto reader = csv::Reader(filepath, cfg);

  auto parsed_row = reader.GetNextRow();
  ASSERT_EQ(parsed_row, std::nullopt);
}
|
Loading…
Reference in New Issue
Block a user