2021-03-02 02:47:41 +08:00
|
|
|
#include "utils/csv_parsing.hpp"
|
|
|
|
|
|
|
|
#include <string_view>
|
|
|
|
|
|
|
|
#include "utils/file.hpp"
|
|
|
|
#include "utils/string.hpp"
|
|
|
|
|
|
|
|
namespace csv {
|
|
|
|
|
|
|
|
// Shorthand for the reader's parse-error type, used by the parsing routines below.
using ParseError = Reader::ParseError;
|
|
|
|
|
|
|
|
void Reader::InitializeStream() {
|
|
|
|
if (!std::filesystem::exists(path_)) {
|
|
|
|
throw CsvReadException("CSV file not found: {}", path_.string());
|
|
|
|
}
|
|
|
|
csv_stream_.open(path_);
|
|
|
|
if (!csv_stream_.good()) {
|
|
|
|
throw CsvReadException("CSV file {} couldn't be opened!", path_.string());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-19 00:24:25 +08:00
|
|
|
std::optional<utils::pmr::string> Reader::GetNextLine() {
|
|
|
|
utils::pmr::string line(memory_);
|
2021-03-02 02:47:41 +08:00
|
|
|
if (!std::getline(csv_stream_, line)) {
|
|
|
|
// reached end of file or an I/0 error occurred
|
|
|
|
if (!csv_stream_.good()) {
|
|
|
|
csv_stream_.close();
|
|
|
|
}
|
|
|
|
return std::nullopt;
|
|
|
|
}
|
|
|
|
++line_count_;
|
|
|
|
return line;
|
|
|
|
}
|
|
|
|
|
2021-03-19 00:24:25 +08:00
|
|
|
// Parses the header row by delegating to ParseRow().
//
// Asserts that `line_count_` is still 1, i.e. that this is called before any
// other line has been consumed, because the header must be the very first line
// in the file.
Reader::ParsingResult Reader::ParseHeader() {
  MG_ASSERT(line_count_ == 1, fmt::format("Invalid use of {}", __func__));

  auto header_row = ParseRow();
  return header_row;
}
|
|
|
|
|
|
|
|
void Reader::TryInitializeHeader() {
|
|
|
|
if (!HasHeader()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
auto header = ParseHeader();
|
|
|
|
if (header.HasError()) {
|
|
|
|
throw CsvReadException("CSV reading : {}", header.GetError().message);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (header->empty()) {
|
2021-03-02 02:47:41 +08:00
|
|
|
throw CsvReadException("CSV file {} empty!", path_);
|
|
|
|
}
|
2021-03-19 00:24:25 +08:00
|
|
|
|
|
|
|
number_of_columns_ = header->size();
|
|
|
|
header_ = *header;
|
2021-03-02 02:47:41 +08:00
|
|
|
}
|
|
|
|
|
2021-03-19 00:24:25 +08:00
|
|
|
// Tells whether the read configuration declares that the file has a header row.
[[nodiscard]] bool Reader::HasHeader() const {
  const bool header_expected = read_config_.with_header;
  return header_expected;
}
|
|
|
|
|
|
|
|
// Gives read-only access to the stored header; the optional is assigned by
// TryInitializeHeader() once a header row has been parsed.
const std::optional<Reader::Header> &Reader::GetHeader() const {
  return header_;
}
|
|
|
|
|
2021-03-02 02:47:41 +08:00
|
|
|
namespace {
|
|
|
|
// States of the character-level state machine driven by Reader::ParseRow().
enum class CsvParserState : uint8_t {
  INITIAL_FIELD,     // start of a row, nothing consumed yet
  NEXT_FIELD,        // a delimiter was just consumed, a new field starts here
  QUOTING,           // inside a quoted field
  NOT_QUOTING,       // inside a regular (unquoted) field
  EXPECT_DELIMITER,  // a quoted field was just closed, a delimiter must follow
};
|
|
|
|
|
|
|
|
bool SubstringStartsWith(const std::string_view str, size_t pos, const std::string_view what) {
|
|
|
|
return utils::StartsWith(utils::Substr(str, pos), what);
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
// Parses the next row of the CSV file.
//
// Lines are fetched with GetNextLine() and fed character by character through
// the CsvParserState state machine. A row normally ends with the line, but
// while the parser is inside a quoted field further lines are pulled in and
// appended to that field. No line-break character is inserted between such
// lines: std::getline drops the line terminator and any '\n' / '\r' characters
// are skipped.
//
// Returns the parsed columns on success. An empty row is returned when no more
// data is available.
// NOTE(review): a blank line leaves the parser in INITIAL_FIELD and therefore
// also produces an empty row, which GetNextRow() interprets as end of file.
//
// Returns a ParseError when:
//  - a NULL byte is found (NULL_BYTE),
//  - a closed quoted field is not followed by a delimiter (UNEXPECTED_TOKEN),
//  - the data ends inside a quoted field (NO_CLOSING_QUOTE),
//  - `number_of_columns_` is non-zero and differs from the number of parsed
//    columns (BAD_NUM_OF_COLUMNS).
Reader::ParsingResult Reader::ParseRow() {
  utils::pmr::vector<utils::pmr::string> row(memory_);
  utils::pmr::string column(memory_);

  auto state = CsvParserState::INITIAL_FIELD;

  do {
    const auto maybe_line = GetNextLine();
    if (!maybe_line) {
      // The whole file was processed.
      break;
    }

    for (size_t i = 0; i < maybe_line->size(); ++i) {
      const auto c = (*maybe_line)[i];

      // Line feeds and carriage returns are ignored in CSVs.
      if (c == '\n' || c == '\r') continue;
      // Null bytes aren't allowed in CSVs.
      if (c == '\0') {
        return ParseError(ParseError::ErrorCode::NULL_BYTE,
                          fmt::format("CSV: Line {:d} contains NULL byte", line_count_ - 1));
      }

      // Whenever a multi-character quote or delimiter is matched, `i` is moved
      // to its last character; the loop increment then steps past it.
      switch (state) {
        case CsvParserState::INITIAL_FIELD:
        case CsvParserState::NEXT_FIELD: {
          if (SubstringStartsWith(*maybe_line, i, *read_config_.quote)) {
            // The current field is a quoted field.
            state = CsvParserState::QUOTING;
            i += read_config_.quote->size() - 1;
          } else if (SubstringStartsWith(*maybe_line, i, *read_config_.delimiter)) {
            // The current field has an empty value.
            row.emplace_back("");
            state = CsvParserState::NEXT_FIELD;
            i += read_config_.delimiter->size() - 1;
          } else {
            // The current field is a regular field.
            column.push_back(c);
            state = CsvParserState::NOT_QUOTING;
          }
          break;
        }
        case CsvParserState::QUOTING: {
          auto quote_now = SubstringStartsWith(*maybe_line, i, *read_config_.quote);
          auto quote_next = SubstringStartsWith(*maybe_line, i + read_config_.quote->size(), *read_config_.quote);
          if (quote_now && quote_next) {
            // This is an escaped quote character.
            column += *read_config_.quote;
            i += read_config_.quote->size() * 2 - 1;
          } else if (quote_now && !quote_next) {
            // This is the end of the quoted field.
            row.emplace_back(std::move(column));
            // A moved-from string is only guaranteed to be valid, not empty;
            // reset it explicitly before it collects the next field.
            column.clear();
            state = CsvParserState::EXPECT_DELIMITER;
            i += read_config_.quote->size() - 1;
          } else {
            column.push_back(c);
          }
          break;
        }
        case CsvParserState::NOT_QUOTING: {
          if (SubstringStartsWith(*maybe_line, i, *read_config_.delimiter)) {
            row.emplace_back(std::move(column));
            // See above: make the reused buffer explicitly empty after the move.
            column.clear();
            state = CsvParserState::NEXT_FIELD;
            i += read_config_.delimiter->size() - 1;
          } else {
            column.push_back(c);
          }
          break;
        }
        case CsvParserState::EXPECT_DELIMITER: {
          if (SubstringStartsWith(*maybe_line, i, *read_config_.delimiter)) {
            state = CsvParserState::NEXT_FIELD;
            i += read_config_.delimiter->size() - 1;
          } else {
            return ParseError(ParseError::ErrorCode::UNEXPECTED_TOKEN,
                              fmt::format("CSV Reader: Expected '{}' after '{}', but got '{}' at line {:d}",
                                          *read_config_.delimiter, *read_config_.quote, c, line_count_ - 1));
          }
          break;
        }
      }
    }
  } while (state == CsvParserState::QUOTING);

  // Flush whatever the final state still owes to the row.
  switch (state) {
    case CsvParserState::INITIAL_FIELD: {
      break;
    }
    case CsvParserState::NEXT_FIELD: {
      // The line ended right after a delimiter: the last field is empty.
      row.emplace_back(std::move(column));
      break;
    }
    case CsvParserState::QUOTING: {
      return ParseError(ParseError::ErrorCode::NO_CLOSING_QUOTE,
                        "There is no more data left to load while inside a quoted string. "
                        "Did you forget to close the quote?");
    }
    case CsvParserState::NOT_QUOTING: {
      row.emplace_back(std::move(column));
      break;
    }
    case CsvParserState::EXPECT_DELIMITER: {
      // The quoted field was already stored when its closing quote was seen.
      break;
    }
  }

  // reached the end of file - return empty row
  if (row.empty()) {
    return row;
  }

  // Has header, but the header has already been read and the number_of_columns_
  // is already set. Otherwise, we would get an error every time we'd try to
  // parse the header.
  // Also, if we don't have a header, the 'number_of_columns_' will be 0, so no
  // need to check the number of columns.
  if (UNLIKELY(number_of_columns_ != 0 && row.size() != number_of_columns_)) {
    return ParseError(ParseError::ErrorCode::BAD_NUM_OF_COLUMNS,
                      // ToDo(the-joksim):
                      //  - 'line_count_ - 1' is the last line of a row (as a
                      //    row may span several lines) ==> should have a row
                      //    counter
                      fmt::format("Expected {:d} columns in row {:d}, but got {:d}", number_of_columns_,
                                  line_count_ - 1, row.size()));
  }

  return row;
}
|
|
|
|
|
|
|
|
// Returns Reader::Row if the read row is valid;
// Returns std::nullopt if end of file is reached or an error occurred
// making it unreadable;
// @throws CsvReadException if a bad row is encountered, and the ignore_bad is set
// to 'false' in the Reader::Config.
|
|
|
|
std::optional<Reader::Row> Reader::GetNextRow() {
|
|
|
|
auto row = ParseRow();
|
|
|
|
|
|
|
|
if (row.HasError()) {
|
2021-03-19 00:24:25 +08:00
|
|
|
if (!read_config_.ignore_bad) {
|
|
|
|
throw CsvReadException("CSV Reader: Bad row at line {:d}: {}", line_count_ - 1, row.GetError().message);
|
2021-03-02 02:47:41 +08:00
|
|
|
}
|
|
|
|
// try to parse as many times as necessary to reach a valid row
|
|
|
|
do {
|
2021-03-19 00:24:25 +08:00
|
|
|
spdlog::debug("CSV Reader: Bad row at line {:d}: {}", line_count_ - 1, row.GetError().message);
|
2021-03-02 02:47:41 +08:00
|
|
|
if (!csv_stream_.good()) {
|
|
|
|
return std::nullopt;
|
|
|
|
}
|
|
|
|
row = ParseRow();
|
|
|
|
} while (row.HasError());
|
|
|
|
}
|
|
|
|
|
2021-03-19 00:24:25 +08:00
|
|
|
if (row->empty()) {
|
2021-03-02 02:47:41 +08:00
|
|
|
// reached end of file
|
|
|
|
return std::nullopt;
|
|
|
|
}
|
2021-03-19 00:24:25 +08:00
|
|
|
return *row;
|
2021-03-02 02:47:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace csv
|