From f85095c203503f2e1dbf907e7c23dc6b6d95e31f Mon Sep 17 00:00:00 2001
From: Matija Santl <matija.santl@memgraph.com>
Date: Tue, 12 Feb 2019 08:45:52 +0100
Subject: [PATCH] Fix Raft shutdown

Summary:
During the following scenario:
 - start a HA cluster with 3 machines
 - find the leader and start sending queries
 - SIGTERM the leader but leave other 2 machines untouched

The leader would be stuck in the shutdown phase.

This was happening because during the shutdown phase of the Bolt server, a
`graph_db_accessor` would try to commit a transaction after we've already shut
down Raft server.  Raft, although not running, is still thinking it's in the
Leader mode. Tx Engine calls the `SafeToCommit` method to Commit transactions,
and ends up in an infinite loop.

Since Raft was shut down it won't handle any of the incoming RPCs and won't
change it's mode.

The fix here is to shut down the Bolt server before Raft, so we don't have any
pending commits once Raft is shut down.

Reviewers: ipaljak

Reviewed By: ipaljak

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D1853
---
 src/database/single_node_ha/graph_db.cpp   | 7 +++----
 src/database/single_node_ha/graph_db.hpp   | 1 -
 src/raft/raft_server.cpp                   | 3 ++-
 src/raft/raft_server.hpp                   | 2 +-
 src/transactions/single_node_ha/engine.cpp | 3 ++-
 tests/feature_benchmark/ha/runner.sh       | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/src/database/single_node_ha/graph_db.cpp b/src/database/single_node_ha/graph_db.cpp
index 40b96727d..4e0a953c5 100644
--- a/src/database/single_node_ha/graph_db.cpp
+++ b/src/database/single_node_ha/graph_db.cpp
@@ -37,19 +37,18 @@ void GraphDb::Start() {
   }
 }
 
-GraphDb::~GraphDb() {}
-
 void GraphDb::AwaitShutdown(std::function<void(void)> call_before_shutdown) {
   coordination_.AwaitShutdown([this, &call_before_shutdown]() {
     tx_engine_.LocalForEachActiveTransaction(
         [](auto &t) { t.set_should_abort(); });
 
     call_before_shutdown();
+
+    raft_server_.Shutdown();
   });
 }
 
 void GraphDb::Shutdown() {
-  raft_server_.Shutdown();
   coordination_.Shutdown();
 }
 
@@ -96,12 +95,12 @@ void GraphDb::CollectGarbage() { storage_gc_->CollectGarbage(); }
 void GraphDb::Reset() {
   // Release gc scheduler to stop it from touching storage.
   storage_gc_ = nullptr;
-  storage_ = std::make_unique<Storage>(config_.properties_on_disk);
 
   // This will make all active transactions to abort and reset the internal
   // state.
   tx_engine_.Reset();
 
+  storage_ = std::make_unique<Storage>(config_.properties_on_disk);
   storage_gc_ = std::make_unique<StorageGc>(
       *storage_, tx_engine_, &raft_server_, config_.gc_cycle_sec);
 }
diff --git a/src/database/single_node_ha/graph_db.hpp b/src/database/single_node_ha/graph_db.hpp
index e4e37bd97..aad71257e 100644
--- a/src/database/single_node_ha/graph_db.hpp
+++ b/src/database/single_node_ha/graph_db.hpp
@@ -89,7 +89,6 @@ class GraphDbAccessor;
 class GraphDb {
  public:
   explicit GraphDb(Config config = Config());
-  ~GraphDb();
 
   GraphDb(const GraphDb &) = delete;
   GraphDb(GraphDb &&) = delete;
diff --git a/src/raft/raft_server.cpp b/src/raft/raft_server.cpp
index d45634606..6c98c3a34 100644
--- a/src/raft/raft_server.cpp
+++ b/src/raft/raft_server.cpp
@@ -444,7 +444,7 @@ void RaftServer::GarbageCollectReplicationLog(const tx::TransactionId &tx_id) {
   rlog_->garbage_collect_older(tx_id);
 }
 
-bool RaftServer::IsLeader() { return mode_ == Mode::LEADER; }
+bool RaftServer::IsLeader() { return !exiting_ && mode_ == Mode::LEADER; }
 
 RaftServer::LogEntryBuffer::LogEntryBuffer(RaftServer *raft_server)
     : raft_server_(raft_server) {
@@ -930,6 +930,7 @@ void RaftServer::PeerThreadMain(uint16_t peer_id) {
       }
     }
 
+    if (exiting_) break;
     state_changed_.wait_until(lock, wait_until);
   }
 }
diff --git a/src/raft/raft_server.hpp b/src/raft/raft_server.hpp
index 04c0cab10..bc163ae6d 100644
--- a/src/raft/raft_server.hpp
+++ b/src/raft/raft_server.hpp
@@ -195,7 +195,7 @@ class RaftServer final : public RaftInterface {
                                             ///< no_op_issuer_thread that a new
                                             ///< leader has been elected.
 
-  bool exiting_{false};  ///< True on server shutdown.
+  std::atomic<bool> exiting_{false};  ///< True on server shutdown.
 
   //////////////////////////////////////////////////////////////////////////////
   // volatile state on followers and candidates
diff --git a/src/transactions/single_node_ha/engine.cpp b/src/transactions/single_node_ha/engine.cpp
index bc16b2104..07e9b3a6b 100644
--- a/src/transactions/single_node_ha/engine.cpp
+++ b/src/transactions/single_node_ha/engine.cpp
@@ -30,7 +30,8 @@ Transaction *Engine::BeginBlocking(
   {
     std::lock_guard<utils::SpinLock> guard(lock_);
     if (!accepting_transactions_.load())
-      throw TransactionEngineError("Engine is not accepting new transactions");
+      throw TransactionEngineError(
+          "The transaction engine currently isn't accepting new transactions.");
 
     // Block the engine from accepting new transactions.
     accepting_transactions_.store(false);
diff --git a/tests/feature_benchmark/ha/runner.sh b/tests/feature_benchmark/ha/runner.sh
index 1f3be8f7b..4fecbaaa5 100755
--- a/tests/feature_benchmark/ha/runner.sh
+++ b/tests/feature_benchmark/ha/runner.sh
@@ -63,7 +63,7 @@ code=$?
 # Shutdown
 for server_id in 1 2 3
 do
-  kill -9 ${HA_PIDS[$server_id]}
+  kill -15 ${HA_PIDS[$server_id]}
 done
 
 # Cleanup