From f85095c203503f2e1dbf907e7c23dc6b6d95e31f Mon Sep 17 00:00:00 2001 From: Matija Santl Date: Tue, 12 Feb 2019 08:45:52 +0100 Subject: [PATCH] Fix Raft shutdown Summary: During the following scenario: - start a HA cluster with 3 machines - find the leader and start sending queries - SIGTERM the leader but leave other 2 machines untouched The leader would be stuck in the shutdown phase. This was happening because during the shutdown phase of the Bolt server, a `graph_db_accessor` would try to commit a transaction after we've already shut down Raft server. Raft, although not running, is still thinking it's in the Leader mode. Tx Engine calls the `SafeToCommit` method to Commit transactions, and ends up in an infinite loop. Since Raft was shut down it won't handle any of the incoming RPCs and won't change it's mode. The fix here is to shut down the Bolt server before Raft, so we don't have any pending commits once Raft is shut down. Reviewers: ipaljak Reviewed By: ipaljak Subscribers: pullbot Differential Revision: https://phabricator.memgraph.io/D1853 --- src/database/single_node_ha/graph_db.cpp | 7 +++---- src/database/single_node_ha/graph_db.hpp | 1 - src/raft/raft_server.cpp | 3 ++- src/raft/raft_server.hpp | 2 +- src/transactions/single_node_ha/engine.cpp | 3 ++- tests/feature_benchmark/ha/runner.sh | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/database/single_node_ha/graph_db.cpp b/src/database/single_node_ha/graph_db.cpp index 40b96727d..4e0a953c5 100644 --- a/src/database/single_node_ha/graph_db.cpp +++ b/src/database/single_node_ha/graph_db.cpp @@ -37,19 +37,18 @@ void GraphDb::Start() { } } -GraphDb::~GraphDb() {} - void GraphDb::AwaitShutdown(std::function call_before_shutdown) { coordination_.AwaitShutdown([this, &call_before_shutdown]() { tx_engine_.LocalForEachActiveTransaction( [](auto &t) { t.set_should_abort(); }); call_before_shutdown(); + + raft_server_.Shutdown(); }); } void GraphDb::Shutdown() { - raft_server_.Shutdown(); coordination_.Shutdown(); } @@ -96,12 +95,12 @@ void GraphDb::CollectGarbage() { storage_gc_->CollectGarbage(); } void GraphDb::Reset() { // Release gc scheduler to stop it from touching storage. storage_gc_ = nullptr; - storage_ = std::make_unique(config_.properties_on_disk); // This will make all active transactions to abort and reset the internal // state. tx_engine_.Reset(); + storage_ = std::make_unique(config_.properties_on_disk); storage_gc_ = std::make_unique( *storage_, tx_engine_, &raft_server_, config_.gc_cycle_sec); } diff --git a/src/database/single_node_ha/graph_db.hpp b/src/database/single_node_ha/graph_db.hpp index e4e37bd97..aad71257e 100644 --- a/src/database/single_node_ha/graph_db.hpp +++ b/src/database/single_node_ha/graph_db.hpp @@ -89,7 +89,6 @@ class GraphDbAccessor; class GraphDb { public: explicit GraphDb(Config config = Config()); - ~GraphDb(); GraphDb(const GraphDb &) = delete; GraphDb(GraphDb &&) = delete; diff --git a/src/raft/raft_server.cpp b/src/raft/raft_server.cpp index d45634606..6c98c3a34 100644 --- a/src/raft/raft_server.cpp +++ b/src/raft/raft_server.cpp @@ -444,7 +444,7 @@ void RaftServer::GarbageCollectReplicationLog(const tx::TransactionId &tx_id) { rlog_->garbage_collect_older(tx_id); } -bool RaftServer::IsLeader() { return mode_ == Mode::LEADER; } +bool RaftServer::IsLeader() { return !exiting_ && mode_ == Mode::LEADER; } RaftServer::LogEntryBuffer::LogEntryBuffer(RaftServer *raft_server) : raft_server_(raft_server) { @@ -930,6 +930,7 @@ void RaftServer::PeerThreadMain(uint16_t peer_id) { } } + if (exiting_) break; state_changed_.wait_until(lock, wait_until); } } diff --git a/src/raft/raft_server.hpp b/src/raft/raft_server.hpp index 04c0cab10..bc163ae6d 100644 --- a/src/raft/raft_server.hpp +++ b/src/raft/raft_server.hpp @@ -195,7 +195,7 @@ class RaftServer final : public RaftInterface { ///< no_op_issuer_thread that a new ///< leader has been elected. - bool exiting_{false}; ///< True on server shutdown. + std::atomic exiting_{false}; ///< True on server shutdown. ////////////////////////////////////////////////////////////////////////////// // volatile state on followers and candidates diff --git a/src/transactions/single_node_ha/engine.cpp b/src/transactions/single_node_ha/engine.cpp index bc16b2104..07e9b3a6b 100644 --- a/src/transactions/single_node_ha/engine.cpp +++ b/src/transactions/single_node_ha/engine.cpp @@ -30,7 +30,8 @@ Transaction *Engine::BeginBlocking( { std::lock_guard guard(lock_); if (!accepting_transactions_.load()) - throw TransactionEngineError("Engine is not accepting new transactions"); + throw TransactionEngineError( + "The transaction engine currently isn't accepting new transactions."); // Block the engine from accepting new transactions. accepting_transactions_.store(false); diff --git a/tests/feature_benchmark/ha/runner.sh b/tests/feature_benchmark/ha/runner.sh index 1f3be8f7b..4fecbaaa5 100755 --- a/tests/feature_benchmark/ha/runner.sh +++ b/tests/feature_benchmark/ha/runner.sh @@ -63,7 +63,7 @@ code=$? # Shutdown for server_id in 1 2 3 do - kill -9 ${HA_PIDS[$server_id]} + kill -15 ${HA_PIDS[$server_id]} done # Cleanup