2011-03-19 06:37:00 +08:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/log_reader.h"
|
|
|
|
|
2011-07-15 08:20:57 +08:00
|
|
|
#include <stdio.h>
|
2011-03-31 02:35:40 +08:00
|
|
|
#include "leveldb/env.h"
|
2011-03-19 06:37:00 +08:00
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/crc32c.h"
|
|
|
|
|
|
|
|
namespace leveldb {
|
|
|
|
namespace log {
|
|
|
|
|
|
|
|
// Out-of-line definition of the Reporter destructor; it has nothing to
// release.
Reader::Reporter::~Reporter() {}
|
|
|
|
|
2011-05-21 10:17:43 +08:00
|
|
|
// Creates a reader that pulls log blocks from "file".
//
// file            source of the log data
// reporter        receives drop notifications; may be NULL (ReportDrop
//                 checks for that before using it)
// checksum        when true, ReadPhysicalRecord verifies each record's CRC
// initial_offset  physical records that start before this file offset are
//                 skipped
Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
               uint64_t initial_offset)
    : file_(file),
      reporter_(reporter),
      checksum_(checksum),
      backing_store_(new char[kBlockSize]),  // freed by ~Reader()
      buffer_(),
      eof_(false),
      last_record_offset_(0),
      end_of_buffer_offset_(0),
      initial_offset_(initial_offset),
      // A reader that starts past offset 0 begins in resync mode, so
      // ReadRecord discards leading kMiddleType/kLastType fragments.
      resyncing_(initial_offset > 0) {}
|
|
|
|
|
|
|
|
// Frees the block buffer allocated with new[] in the constructor. Nothing
// else is released here.
Reader::~Reader() { delete[] backing_store_; }
|
|
|
|
|
2011-05-21 10:17:43 +08:00
|
|
|
bool Reader::SkipToInitialBlock() {
|
2017-10-20 01:33:42 +08:00
|
|
|
const size_t offset_in_block = initial_offset_ % kBlockSize;
|
2011-05-21 10:17:43 +08:00
|
|
|
uint64_t block_start_location = initial_offset_ - offset_in_block;
|
|
|
|
|
|
|
|
// Don't search a block if we'd be in the trailer
|
|
|
|
if (offset_in_block > kBlockSize - 6) {
|
|
|
|
block_start_location += kBlockSize;
|
|
|
|
}
|
|
|
|
|
|
|
|
end_of_buffer_offset_ = block_start_location;
|
|
|
|
|
|
|
|
// Skip to start of first block that can contain the initial record
|
|
|
|
if (block_start_location > 0) {
|
|
|
|
Status skip_status = file_->Skip(block_start_location);
|
|
|
|
if (!skip_status.ok()) {
|
|
|
|
ReportDrop(block_start_location, skip_status);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-03-19 06:37:00 +08:00
|
|
|
// Reads the next complete logical record.
//
// On success returns true and points *record at the payload: either the
// single physical fragment returned by ReadPhysicalRecord (kFullType) or the
// contents of *scratch, in which the fragments of a split record are
// reassembled. Returns false when ReadPhysicalRecord reports kEof or when
// the initial skip to initial_offset_ fails.
//
// Both *record and *scratch are cleared on entry.
bool Reader::ReadRecord(Slice* record, std::string* scratch) {
  // While no record at or beyond initial_offset_ has been returned, position
  // the file at the first block that may contain one.
  if (last_record_offset_ < initial_offset_ && !SkipToInitialBlock()) {
    return false;
  }

  scratch->clear();
  record->clear();

  // True while the fragments of a multi-part record are being collected.
  bool assembling = false;
  // Offset of the first fragment of the logical record being read. The 0 is
  // only a placeholder to keep compilers happy.
  uint64_t logical_start = 0;

  Slice fragment;
  for (;;) {
    const unsigned int type = ReadPhysicalRecord(&fragment);

    // Compute the offset only after ReadPhysicalRecord has returned: its
    // buffer may have held nothing but an empty block trailer, in which case
    // the record actually came from the next block. Deriving the offset from
    // the current buffer state (and accounting for the header size) avoids
    // reporting an offset that points into the previous block's padding.
    const uint64_t fragment_start =
        end_of_buffer_offset_ - buffer_.size() - kHeaderSize - fragment.size();

    if (resyncing_) {
      // Discard the tail of a record that began before initial_offset_:
      // middle fragments keep resync mode on, anything else ends it, and the
      // closing kLastType fragment is itself discarded.
      if (type == kMiddleType) continue;
      resyncing_ = false;
      if (type == kLastType) continue;
    }

    switch (type) {
      case kFullType:
        // Earlier versions of log::Writer could emit an empty kFirstType
        // record at the tail end of a block followed by a kFullType or
        // kFirstType record at the start of the next block. Only a non-empty
        // pending fragment is therefore treated as corruption.
        if (assembling && !scratch->empty()) {
          ReportCorruption(scratch->size(), "partial record without end(1)");
        }
        scratch->clear();
        *record = fragment;
        last_record_offset_ = fragment_start;
        return true;

      case kFirstType:
        // Same log::Writer quirk as described for kFullType.
        if (assembling && !scratch->empty()) {
          ReportCorruption(scratch->size(), "partial record without end(2)");
        }
        logical_start = fragment_start;
        scratch->assign(fragment.data(), fragment.size());
        assembling = true;
        break;

      case kMiddleType:
        if (assembling) {
          scratch->append(fragment.data(), fragment.size());
        } else {
          ReportCorruption(fragment.size(),
                           "missing start of fragmented record(1)");
        }
        break;

      case kLastType:
        if (assembling) {
          scratch->append(fragment.data(), fragment.size());
          *record = Slice(*scratch);
          last_record_offset_ = logical_start;
          return true;
        }
        ReportCorruption(fragment.size(),
                         "missing start of fragmented record(2)");
        break;

      case kEof:
        // A partially assembled record at EOF can be caused by the writer
        // dying right after writing one physical record but before finishing
        // the next. That is not corruption; just drop the logical record.
        if (assembling) {
          scratch->clear();
        }
        return false;

      case kBadRecord:
        if (assembling) {
          ReportCorruption(scratch->size(), "error in middle of record");
          assembling = false;
          scratch->clear();
        }
        break;

      default: {
        char msg[40];
        snprintf(msg, sizeof(msg), "unknown record type %u", type);
        // Drop this fragment plus anything collected so far.
        const size_t dropped =
            fragment.size() + (assembling ? scratch->size() : 0);
        ReportCorruption(dropped, msg);
        assembling = false;
        scratch->clear();
        break;
      }
    }
  }
  return false;
}
|
|
|
|
|
2011-05-21 10:17:43 +08:00
|
|
|
uint64_t Reader::LastRecordOffset() {
|
|
|
|
return last_record_offset_;
|
|
|
|
}
|
|
|
|
|
2014-09-17 05:19:52 +08:00
|
|
|
void Reader::ReportCorruption(uint64_t bytes, const char* reason) {
|
2011-05-21 10:17:43 +08:00
|
|
|
ReportDrop(bytes, Status::Corruption(reason));
|
|
|
|
}
|
|
|
|
|
2014-09-17 05:19:52 +08:00
|
|
|
void Reader::ReportDrop(uint64_t bytes, const Status& reason) {
|
2011-05-21 10:17:43 +08:00
|
|
|
if (reporter_ != NULL &&
|
|
|
|
end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
|
2014-09-17 05:19:52 +08:00
|
|
|
reporter_->Corruption(static_cast<size_t>(bytes), reason);
|
2011-03-19 06:37:00 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reads the next physical record from the log.
//
// On success stores the record payload in *result and returns the record's
// type byte. Otherwise returns one of two special values:
//   kEof        the read failed, or the input ended (possibly in the middle
//               of a header or payload)
//   kBadRecord  the record was dropped: bad length, checksum mismatch,
//               zero-filled region, or it started before initial_offset_
//
// Refills buffer_ from file_ one block (kBlockSize bytes) at a time and
// advances end_of_buffer_offset_ by the number of bytes read.
unsigned int Reader::ReadPhysicalRecord(Slice* result) {
  while (true) {
    if (buffer_.size() < kHeaderSize) {
      if (!eof_) {
        // Last read was a full read, so this is a trailer to skip
        buffer_.clear();
        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
        end_of_buffer_offset_ += buffer_.size();
        if (!status.ok()) {
          buffer_.clear();
          ReportDrop(kBlockSize, status);
          eof_ = true;
          return kEof;
        } else if (buffer_.size() < kBlockSize) {
          // A short read means the end of the input was reached; whatever
          // was read is still parsed on the next loop iteration.
          eof_ = true;
        }
        continue;
      } else {
        // Note that if buffer_ is non-empty, we have a truncated header at the
        // end of the file, which can be caused by the writer crashing in the
        // middle of writing the header. Instead of considering this an error,
        // just report EOF.
        buffer_.clear();
        return kEof;
      }
    }

    // Parse the header: bytes 0-3 hold the masked CRC, bytes 4-5 the
    // little-endian payload length, byte 6 the record type.
    const char* header = buffer_.data();
    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
    // Mask the type byte the same way as the length bytes: plain char may be
    // signed, and an unmasked conversion would sign-extend a byte >= 0x80
    // into a huge, platform-dependent value.
    const unsigned int type = static_cast<unsigned int>(header[6]) & 0xff;
    const uint32_t length = a | (b << 8);
    if (kHeaderSize + length > buffer_.size()) {
      size_t drop_size = buffer_.size();
      buffer_.clear();
      if (!eof_) {
        ReportCorruption(drop_size, "bad record length");
        return kBadRecord;
      }
      // If the end of the file has been reached without reading |length| bytes
      // of payload, assume the writer died in the middle of writing the record.
      // Don't report a corruption.
      return kEof;
    }

    if (type == kZeroType && length == 0) {
      // Skip zero length record without reporting any drops since
      // such records are produced by the mmap based writing code in
      // env_posix.cc that preallocates file regions.
      buffer_.clear();
      return kBadRecord;
    }

    // Check crc. The checksum covers the type byte and the payload.
    if (checksum_) {
      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
      if (actual_crc != expected_crc) {
        // Drop the rest of the buffer since "length" itself may have
        // been corrupted and if we trust it, we could find some
        // fragment of a real log record that just happens to look
        // like a valid log record.
        size_t drop_size = buffer_.size();
        buffer_.clear();
        ReportCorruption(drop_size, "checksum mismatch");
        return kBadRecord;
      }
    }

    // Consume the record. "header" still points at its bytes, because
    // remove_prefix only advances the view held by buffer_.
    buffer_.remove_prefix(kHeaderSize + length);

    // Skip physical record that started before initial_offset_
    if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
        initial_offset_) {
      result->clear();
      return kBadRecord;
    }

    *result = Slice(header + kHeaderSize, length);
    return type;
  }
}
|
|
|
|
|
2011-11-01 01:22:06 +08:00
|
|
|
} // namespace log
|
|
|
|
} // namespace leveldb
|