Improve error handling (#56)
This commit is contained in:
parent 200ce5f45e
commit f7b764607d
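Summary of the changes in the hunks below: RPC error handling in the
replication client is centralized. A new TryInitializeClient() wraps the
handshake and parks the client in the INVALID state when it fails; a new
HandleRpcFailure() logs the failure and queues an abort-and-reconnect on the
client's single-threaded task pool; StartTransactionReplication() becomes
fire-and-forget (void); each recovery step is guarded individually;
AppendDeltasHandler asserts the epoch id with a CHECK; a failed snapshot load
on the replica is now fatal; and the snapshot-version error carries the
offending version. Several stale TODOs are dropped.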
@@ -165,7 +165,7 @@ RecoveredSnapshot LoadSnapshot(
   if (!version)
     throw RecoveryFailure("Couldn't read snapshot magic and/or version!");
   if (!IsVersionSupported(*version))
-    throw RecoveryFailure("Invalid snapshot version!");
+    throw RecoveryFailure(fmt::format("Invalid snapshot version {}", *version));

   // Cleanup of loaded data in case of failure.
   bool success = false;
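The new message interpolates the unsupported version into the exception text.
A standalone sketch of the pattern; RecoveryFailure is reduced to a
runtime_error wrapper and kSupportedVersion is a hypothetical value:

    #include <cstdint>
    #include <optional>
    #include <stdexcept>

    #include <fmt/format.h>

    // Stand-in for the durability RecoveryFailure exception.
    struct RecoveryFailure : std::runtime_error {
      using std::runtime_error::runtime_error;
    };

    void CheckSnapshotVersion(std::optional<uint64_t> version) {
      if (!version)
        throw RecoveryFailure("Couldn't read snapshot magic and/or version!");
      constexpr uint64_t kSupportedVersion = 14;  // hypothetical value
      if (*version != kSupportedVersion)
        // Carrying the actual value makes a failed recovery diagnosable
        // from the log alone.
        throw RecoveryFailure(
            fmt::format("Invalid snapshot version {}", *version));
    }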
@@ -173,6 +173,7 @@ RecoveredSnapshot LoadSnapshot(
     if (!success) {
       edges->clear();
       vertices->clear();
+      epoch_history->clear();
     }
   });

@@ -28,14 +28,15 @@ Storage::ReplicationClient::ReplicationClient(
   }

   rpc_client_.emplace(endpoint, &*rpc_context_);
-  InitializeClient();
+  TryInitializeClient();

-  if (config.timeout) {
+  if (config.timeout && replica_state_ != replication::ReplicaState::INVALID) {
     timeout_.emplace(*config.timeout);
     timeout_dispatcher_.emplace();
   }
 }

+/// @throws rpc::RpcFailedException
 void Storage::ReplicationClient::InitializeClient() {
   uint64_t current_commit_timestamp{kTimestampInitialId};
   auto stream{
@@ -74,6 +75,25 @@ void Storage::ReplicationClient::InitializeClient() {
   }
 }

+void Storage::ReplicationClient::TryInitializeClient() {
+  try {
+    InitializeClient();
+  } catch (const rpc::RpcFailedException &) {
+    std::unique_lock client_guard{client_lock_};
+    replica_state_.store(replication::ReplicaState::INVALID);
+    LOG(ERROR) << "Failed to connect to replica " << name_ << " at "
+               << rpc_client_->Endpoint();
+  }
+}
+
+void Storage::ReplicationClient::HandleRpcFailure() {
+  LOG(ERROR) << "Couldn't replicate data to " << name_;
+  thread_pool_.AddTask([this] {
+    rpc_client_->Abort();
+    this->TryInitializeClient();
+  });
+}
+
 SnapshotRes Storage::ReplicationClient::TransferSnapshot(
     const std::filesystem::path &path) {
   auto stream{rpc_client_->Stream<SnapshotRpc>()};
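These two helpers form the client's entire failure path: any RPC error funnels
into HandleRpcFailure(), which queues an abort plus a fresh handshake; if the
handshake itself fails, TryInitializeClient() parks the client in INVALID
until the next attempt. A reduction of that shape, with the RPC layer and the
thread pool replaced by stand-ins:

    #include <atomic>
    #include <iostream>
    #include <mutex>

    enum class ReplicaState { READY, REPLICATING, RECOVERY, INVALID };

    struct RpcFailedException {};

    // Stand-in client: the background thread pool is reduced to an
    // inline call so the control flow stays visible.
    class ClientSketch {
     public:
      void TryInitializeClient() {
        try {
          InitializeClient();  // may throw RpcFailedException
        } catch (const RpcFailedException &) {
          std::unique_lock<std::mutex> guard{lock_};
          // Park the client; replication calls will route through
          // HandleRpcFailure() and retry the handshake later.
          state_.store(ReplicaState::INVALID);
          std::cerr << "Failed to connect to replica\n";
        }
      }

      void HandleRpcFailure() {
        std::cerr << "Couldn't replicate data\n";
        AbortRpc();             // drop the broken connection first
        TryInitializeClient();  // then attempt a clean handshake
      }

     private:
      void InitializeClient() { /* handshake; throws on RPC failure */ }
      void AbortRpc() {}
      std::mutex lock_;
      std::atomic<ReplicaState> state_{ReplicaState::INVALID};
    };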
@@ -103,44 +123,39 @@ OnlySnapshotRes Storage::ReplicationClient::TransferOnlySnapshot(
   return stream.AwaitResponse();
 }

-bool Storage::ReplicationClient::StartTransactionReplication(
+void Storage::ReplicationClient::StartTransactionReplication(
     const uint64_t current_wal_seq_num) {
   std::unique_lock guard(client_lock_);
   const auto status = replica_state_.load();
   switch (status) {
     case replication::ReplicaState::RECOVERY:
       DLOG(INFO) << "Replica " << name_ << " is behind MAIN instance";
-      return false;
+      return;
     case replication::ReplicaState::REPLICATING:
-      DLOG(INFO) << "Replica missed a transaction, going to recovery";
+      DLOG(INFO) << "Replica " << name_ << " missed a transaction";
+      // We missed a transaction because we're still replicating
+      // the previous transaction so we need to go to RECOVERY
+      // state to catch up with the missing transaction
+      // We cannot queue the recovery process here because
+      // an error can happen while we're replicating the previous
+      // transaction after which the client should go to
+      // INVALID state before starting the recovery process
       replica_state_.store(replication::ReplicaState::RECOVERY);
-      // If it's in replicating state, it should have been up to date with all
-      // the commits until now so the replica should contain the
-      // last_commit_timestamp
-      thread_pool_.AddTask([=, this] {
-        this->RecoverReplica(storage_->last_commit_timestamp_.load());
-      });
-      return false;
+      return;
     case replication::ReplicaState::INVALID:
-      LOG(ERROR) << "Couldn't replicate data to " << name_;
-      return false;
+      HandleRpcFailure();
+      return;
     case replication::ReplicaState::READY:
       CHECK(!replica_stream_);
       try {
         replica_stream_.emplace(
             ReplicaStream{this, storage_->last_commit_timestamp_.load(),
                           current_wal_seq_num});
-      } catch (const rpc::RpcFailedException &) {
-        replica_state_.store(replication::ReplicaState::INVALID);
-        LOG(ERROR) << "Couldn't replicate data to " << name_;
-        thread_pool_.AddTask([this] {
-          rpc_client_->Abort();
-          InitializeClient();
-        });
-        return false;
-      }
-      replica_state_.store(replication::ReplicaState::REPLICATING);
-      return true;
+        replica_state_.store(replication::ReplicaState::REPLICATING);
+      } catch (const rpc::RpcFailedException &) {
+        HandleRpcFailure();
+      }
+      return;
   }
 }

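Dropping the bool return is deliberate: every branch of the switch now
resolves its own state, so the caller has nothing left to decide. A reduced
sketch of the dispatch, with a stand-in state enum:

    #include <atomic>

    enum class State { READY, REPLICATING, RECOVERY, INVALID };

    void StartTransactionReplicationSketch(std::atomic<State> &state) {
      switch (state.load()) {
        case State::RECOVERY:
          return;  // already catching up; this transaction is skipped
        case State::REPLICATING:
          // A transaction arrived while the previous one was still
          // streaming; degrade to RECOVERY and catch up later.
          state.store(State::RECOVERY);
          return;
        case State::INVALID:
          return;  // broken connection; the HandleRpcFailure() path retries
        case State::READY:
          // Open the replica stream; on RpcFailedException this branch
          // would call HandleRpcFailure() instead of transitioning.
          state.store(State::REPLICATING);
          return;
      }
    }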
@@ -150,11 +165,7 @@ void Storage::ReplicationClient::IfStreamingTransaction(
     try {
       callback(*replica_stream_);
     } catch (const rpc::RpcFailedException &) {
-      LOG(ERROR) << "Couldn't replicate data to " << name_;
-      thread_pool_.AddTask([this] {
-        rpc_client_->Abort();
-        InitializeClient();
-      });
+      HandleRpcFailure();
     }
   }
 }
@@ -209,25 +220,16 @@ void Storage::ReplicationClient::FinalizeTransactionReplicationInternal() {
   if (replica_stream_) {
     try {
       auto response = replica_stream_->Finalize();
-      if (!response.success) {
-        {
       std::unique_lock client_guard{client_lock_};
+      if (!response.success ||
+          replica_state_ == replication::ReplicaState::RECOVERY) {
         replica_state_.store(replication::ReplicaState::RECOVERY);
-        }
         thread_pool_.AddTask([&, this] {
           this->RecoverReplica(response.current_commit_timestamp);
         });
       }
     } catch (const rpc::RpcFailedException &) {
-      LOG(ERROR) << "Couldn't replicate data to " << name_;
-      {
-        std::unique_lock client_guard{client_lock_};
-        replica_state_.store(replication::ReplicaState::INVALID);
-      }
-      thread_pool_.AddTask([this] {
-        rpc_client_->Abort();
-        InitializeClient();
-      });
+      HandleRpcFailure();
     }
     replica_stream_.reset();
   }
@@ -244,6 +246,7 @@ void Storage::ReplicationClient::RecoverReplica(uint64_t replica_commit) {

   const auto steps = GetRecoverySteps(replica_commit, &file_locker);
   for (const auto &recovery_step : steps) {
+    try {
     std::visit(
         [&, this]<typename T>(T &&arg) {
           using StepType = std::remove_cvref_t<T>;
@@ -251,13 +254,16 @@ void Storage::ReplicationClient::RecoverReplica(uint64_t replica_commit) {
             DLOG(INFO) << "Sending the latest snapshot file: " << arg;
             auto response = TransferSnapshot(arg);
             replica_commit = response.current_commit_timestamp;
-            DLOG(INFO) << "CURRENT TIMESTAMP ON REPLICA: " << replica_commit;
+            DLOG(INFO) << "CURRENT TIMESTAMP ON REPLICA: "
+                       << replica_commit;
           } else if constexpr (std::is_same_v<StepType, RecoveryWals>) {
             DLOG(INFO) << "Sending the latest wal files";
             auto response = TransferWalFiles(arg);
             replica_commit = response.current_commit_timestamp;
-            DLOG(INFO) << "CURRENT TIMESTAMP ON REPLICA: " << replica_commit;
-          } else if constexpr (std::is_same_v<StepType, RecoveryCurrentWal>) {
+            DLOG(INFO) << "CURRENT TIMESTAMP ON REPLICA: "
+                       << replica_commit;
+          } else if constexpr (std::is_same_v<StepType,
+                                              RecoveryCurrentWal>) {
             std::unique_lock transaction_guard(storage_->engine_lock_);
             if (storage_->wal_file_ &&
                 storage_->wal_file_->SequenceNumber() ==
@@ -266,8 +272,8 @@ void Storage::ReplicationClient::RecoverReplica(uint64_t replica_commit) {
               transaction_guard.unlock();
               DLOG(INFO) << "Sending current wal file";
               replica_commit = ReplicateCurrentWal();
-              DLOG(INFO) << "CURRENT TIMESTAMP ON REPLICA: "
-                         << replica_commit;
+              DLOG(INFO)
+                  << "CURRENT TIMESTAMP ON REPLICA: " << replica_commit;
               storage_->wal_file_->EnableFlushing();
             }
           } else if constexpr (std::is_same_v<StepType,
@@ -283,6 +289,9 @@ void Storage::ReplicationClient::RecoverReplica(uint64_t replica_commit) {
           }
         },
         recovery_step);
+    } catch (const rpc::RpcFailedException &) {
+      HandleRpcFailure();
+    }
   }

   if (storage_->last_commit_timestamp_.load() == replica_commit) {
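With each recovery step wrapped in its own try/catch, a connection dropped
mid-recovery no longer escapes std::visit: HandleRpcFailure() queues the
abort-and-reconnect, the loop continues, and the timestamp comparison below
decides whether the replica actually caught up.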
@@ -391,7 +400,6 @@ Storage::ReplicationClient::GetRecoverySteps(
     return recovery_steps;
   }

-  ++rwal_it;
   uint64_t previous_seq_num{rwal_it->seq_num};
   for (; rwal_it != wal_files->rend(); ++rwal_it) {
     // If the difference between two consecutive wal files is not 0 or 1
@@ -403,11 +411,12 @@ Storage::ReplicationClient::GetRecoverySteps(
     // Find first WAL that contains up to replica commit, i.e. WAL
     // that is before the replica commit or contains the replica commit
     // as the last committed transaction.
-    if (replica_commit >= rwal_it->from_timestamp &&
-        replica_commit >= rwal_it->to_timestamp) {
+    if (replica_commit >= rwal_it->from_timestamp) {
+      if (replica_commit >= rwal_it->to_timestamp) {
       // We want the WAL after because the replica already contains all the
       // commits from this WAL
       --rwal_it;
+      }
       std::vector<std::filesystem::path> wal_chain;
       auto distance_from_first = std::distance(rwal_it, wal_files->rend() - 1);
       // We have managed to create WAL chain
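The old condition required replica_commit to clear both bounds before
entering the branch, silently skipping a WAL whose range merely contains the
commit. The split check fixes that: such a WAL is itself still needed, and
only a fully-applied one is stepped over. An index-based, illustrative
reduction of the reverse walk (WalInfo and the function are stand-ins):

    #include <cstddef>
    #include <cstdint>
    #include <optional>
    #include <vector>

    struct WalInfo {
      uint64_t from_timestamp;
      uint64_t to_timestamp;
    };

    // wals is sorted oldest -> newest. Returns the index of the first WAL
    // file the replica still needs, or std::nullopt when every commit is
    // already covered or no file reaches back to replica_commit.
    std::optional<std::size_t> FirstNeededWal(const std::vector<WalInfo> &wals,
                                              uint64_t replica_commit) {
      for (std::size_t i = wals.size(); i-- > 0;) {  // newest -> oldest
        if (replica_commit < wals[i].from_timestamp) continue;
        if (replica_commit >= wals[i].to_timestamp) {
          // wals[i] is fully applied on the replica; the chain starts at
          // the next newer file -- the `--rwal_it` step in the diff.
          if (i + 1 < wals.size()) return i + 1;
          return std::nullopt;
        }
        // replica_commit falls inside wals[i]; this file is still needed.
        return i;
      }
      return std::nullopt;
    }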
@@ -90,7 +90,7 @@ class Storage::ReplicationClient {
     rpc::Client::StreamHandler<CurrentWalRpc> stream_;
   };

-  bool StartTransactionReplication(uint64_t current_wal_seq_num);
+  void StartTransactionReplication(uint64_t current_wal_seq_num);

   // Replication clients can be removed at any point
   // so to avoid any complexity of checking if the client was removed whenever
@@ -155,6 +155,10 @@ class Storage::ReplicationClient {

   void InitializeClient();

+  void TryInitializeClient();
+
+  void HandleRpcFailure();
+
   std::string name_;

   Storage *storage_;
@@ -190,6 +194,19 @@ class Storage::ReplicationClient {
   std::optional<TimeoutDispatcher> timeout_dispatcher_;

   utils::SpinLock client_lock_;
+  // This thread pool is used for background tasks so we don't
+  // block the main storage thread.
+  // We use only 1 thread for 2 reasons:
+  //  - background tasks ALWAYS contain some kind of RPC communication.
+  //    We can't have multiple RPC communications from the same client
+  //    because that's not logically valid (e.g. you cannot send a snapshot
+  //    and WAL at the same time because the WAL would arrive earlier and be
+  //    applied before the snapshot, which is not correct)
+  //  - the implementation is simplified as we have total control of what
+  //    this pool is executing. Also, we can simply queue multiple tasks
+  //    and be sure of the execution order.
+  //    Not having multiple possible threads in the same client allows us
+  //    to ignore concurrency problems inside the client.
   utils::ThreadPool thread_pool_{1};
   std::atomic<replication::ReplicaState> replica_state_{
       replication::ReplicaState::INVALID};
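The comment pins the correctness of the whole error-handling scheme on strict
FIFO execution of the background tasks. A generic one-thread task runner
demonstrating that guarantee; this is an illustration, not memgraph's
utils::ThreadPool:

    #include <condition_variable>
    #include <functional>
    #include <mutex>
    #include <queue>
    #include <thread>

    // Tasks run strictly in enqueue order on a single worker, so
    // "send snapshot, then WALs" can never be reordered.
    class SingleThreadPool {
     public:
      SingleThreadPool() : worker_([this] { Run(); }) {}

      ~SingleThreadPool() {
        {
          std::unique_lock<std::mutex> lock(mutex_);
          done_ = true;
        }
        cv_.notify_one();
        worker_.join();
      }

      void AddTask(std::function<void()> task) {
        {
          std::unique_lock<std::mutex> lock(mutex_);
          tasks_.push(std::move(task));
        }
        cv_.notify_one();
      }

     private:
      void Run() {
        while (true) {
          std::function<void()> task;
          {
            std::unique_lock<std::mutex> lock(mutex_);
            cv_.wait(lock, [this] { return done_ || !tasks_.empty(); });
            if (done_ && tasks_.empty()) return;
            task = std::move(tasks_.front());
            tasks_.pop();
          }
          task();  // executed outside the lock, one at a time, FIFO
        }
      }

      std::mutex mutex_;
      std::condition_variable cv_;
      std::queue<std::function<void()>> tasks_;
      bool done_ = false;
      std::thread worker_;  // declared last so it starts after the members
    };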
@@ -112,6 +112,13 @@ void Storage::ReplicationServer::AppendDeltasHandler(
   auto maybe_epoch_id = decoder.ReadString();
   CHECK(maybe_epoch_id) << "Invalid replication message";

+  // Different epoch ids should not be possible in AppendDeltas
+  // because Recovery and Heartbeat handlers should resolve
+  // any issues with timestamp and epoch id
+  CHECK(*maybe_epoch_id == storage_->epoch_id_)
+      << "Received Deltas from transaction with incompatible"
+         " epoch id";
+
   const auto read_delta =
       [&]() -> std::pair<uint64_t, durability::WalDeltaData> {
     try {
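Replacing the later throw (next hunk) with glog's CHECK turns an epoch
mismatch from a recoverable error into a fatal invariant violation: the macro
logs the streamed message at FATAL severity and aborts the process. Minimal
usage of the macro, with illustrative names:

    #include <string>

    #include <glog/logging.h>

    void AssertSameEpoch(const std::string &received, const std::string &ours) {
      // Aborts the whole process when the condition is false.
      CHECK(received == ours)
          << "Received Deltas from transaction with incompatible epoch id";
    }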
@@ -126,11 +133,6 @@ void Storage::ReplicationServer::AppendDeltasHandler(
     }
   };

-  // TODO (antonio2368): Add error handling for different epoch id
-  if (*maybe_epoch_id != storage_->epoch_id_) {
-    throw utils::BasicException("Invalid epoch id");
-  }
-
   if (req.previous_commit_timestamp !=
       storage_->last_commit_timestamp_.load()) {
     // Empty the stream
@@ -532,7 +534,6 @@ void Storage::ReplicationServer::SnapshotHandler(slk::Reader *req_reader,
   storage_->edges_.clear();

   storage_->constraints_ = Constraints();
-  // TODO (antonio2368): Check if there's a less hacky way
   storage_->indices_.label_index = LabelIndex(
       &storage_->indices_, &storage_->constraints_, storage_->config_.items);
   storage_->indices_.label_property_index = LabelPropertyIndex(
@@ -558,8 +559,7 @@ void Storage::ReplicationServer::SnapshotHandler(slk::Reader *req_reader,
         recovered_snapshot.indices_constraints, &storage_->indices_,
         &storage_->constraints_, &storage_->vertices_);
   } catch (const durability::RecoveryFailure &e) {
-    // TODO (antonio2368): What to do if the sent snapshot is invalid
-    LOG(WARNING) << "Couldn't load the snapshot because of: " << e.what();
+    LOG(FATAL) << "Couldn't load the snapshot because of: " << e.what();
   }
   storage_->last_commit_timestamp_ = storage_->timestamp_ - 1;
   storage_guard.unlock();
@@ -2003,7 +2003,6 @@ utils::BasicResult<Storage::RegisterReplicaError> Storage::RegisterReplica(
     std::string name, io::network::Endpoint endpoint,
     const replication::ReplicationMode replication_mode,
     const replication::ReplicationClientConfig &config) {
-  // TODO (antonio2368): This shouldn't stop the main instance
   CHECK(replication_role_.load() == ReplicationRole::MAIN)
       << "Only main instance can register a replica!";
