Implement log replication in Raft
Reviewers: msantl
Reviewed By: msantl
Subscribers: pullbot
Differential Revision: https://phabricator.memgraph.io/D1782

This commit is contained in:
parent 363cdb8b88
commit cc3192cef7
@@ -15,6 +15,10 @@ cpp<#

(lcp:define-struct log-entry ()
  ((term :uint64_t)
   (deltas "std::vector<database::StateDelta>" :capnp-type "List(Database.StateDelta)"))
  (:public #>cpp
    LogEntry() = default;
    LogEntry(uint64_t _term, std::vector<database::StateDelta> _deltas)
        : term(_term), deltas(_deltas) {}
  cpp<#)
  (:serialize (:slk) (:capnp)))

(lcp:pop-namespace) ;; raft
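Note: the LCP definition above generates a plain C++ struct plus the SLK/Cap'n Proto serialization code. As a rough, hand-written approximation (names follow the definition above; the serialization hooks are omitted, and StateDelta is a stand-in), the generated type looks like this:

#include <cstdint>
#include <utility>
#include <vector>

namespace database {
struct StateDelta {};  // Stand-in; the real type carries WAL-style deltas.
}  // namespace database

namespace raft {

// Approximate expansion of `lcp:define-struct log-entry`.
struct LogEntry {
  LogEntry() = default;
  LogEntry(uint64_t _term, std::vector<database::StateDelta> _deltas)
      : term(_term), deltas(std::move(_deltas)) {}

  uint64_t term;                             // Term in which the entry was created.
  std::vector<database::StateDelta> deltas;  // One transaction's batched deltas.
};

}  // namespace raft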
@@ -31,19 +31,21 @@ RaftServer::RaftServer(uint16_t server_id, const std::string &durability_dir,
      rlog_(std::make_unique<ReplicationLog>()),
      mode_(Mode::FOLLOWER),
      server_id_(server_id),
      commit_index_(0),
      last_applied_(0),
      disk_storage_(fs::path(durability_dir) / kRaftDir),
      reset_callback_(reset_callback),
      no_op_create_callback_(no_op_create_callback) {}

void RaftServer::Start() {
  // Persistent storage initialization/recovery.
  // Persistent storage initialization
  if (Log().empty()) {
    UpdateTerm(0);
  } else {
    Recover();
    LogEntry empty_log_entry(0, {});
    AppendLogEntries(0, 0, {empty_log_entry});
  }

  // Peer state
  // Peer state initialization
  int cluster_size = coordination_->WorkerCount() + 1;
  next_index_.resize(cluster_size);
  match_index_.resize(cluster_size);
@@ -61,7 +63,7 @@ void RaftServer::Start() {
  // "If a server receives a request with a stale term,
  // it rejects the request"
  uint64_t current_term = CurrentTerm();
  if (req.term < current_term) {
  if (exiting_ || req.term < current_term) {
    RequestVoteRes res(false, current_term);
    Save(res, res_builder);
    return;
@@ -83,11 +85,12 @@ void RaftServer::Start() {
  // up-to-date than that of the candidate"
  std::experimental::optional<uint16_t> voted_for = VotedFor();
  auto last_entry_data = LastEntryData();
  RequestVoteRes res(
  bool grant_vote =
      (!voted_for || voted_for.value() == req.candidate_id) &&
      AtLeastUpToDate(req.last_log_index, req.last_log_term,
                      last_entry_data.first, last_entry_data.second),
      current_term);
      AtLeastUpToDate(req.last_log_index, req.last_log_term,
                      last_entry_data.first, last_entry_data.second);
  RequestVoteRes res(grant_vote, current_term);
  if (grant_vote) SetNextElectionTimePoint();
  Save(res, res_builder);
});
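Note: the `grant_vote` condition implements the election restriction from [Raft paper 5.4.1]: a vote is granted only if this server has not already voted for someone else in the current term and the candidate's log is at least as up-to-date. A minimal sketch of the up-to-date comparison (the real `AtLeastUpToDate` signature may differ):

#include <cstdint>

// Sketch of the [Raft paper 5.4.1] comparison: the candidate's log is at
// least as up-to-date if its last term is higher, or the terms are equal
// and its log is at least as long.
bool AtLeastUpToDateSketch(uint64_t candidate_last_index,
                           uint64_t candidate_last_term,
                           uint64_t my_last_index, uint64_t my_last_term) {
  if (candidate_last_term != my_last_term)
    return candidate_last_term > my_last_term;
  return candidate_last_index >= my_last_index;
}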
@@ -115,8 +118,16 @@ void RaftServer::Start() {
    if (mode_ != Mode::FOLLOWER) Transition(Mode::FOLLOWER);
  }

  // [Raft paper 5.3]
  // "Once a follower learns that a log entry is committed, it applies
  // the entry to its state machine (in log order)"
  while (req.leader_commit > last_applied_ &&
         last_applied_ + 1 < Log().size()) {
    ++last_applied_;
    delta_applier_->Apply(Log()[last_applied_].deltas);
  }

  // respond positively to a heartbeat.
  // TODO(ipaljak) review this when implementing log replication.
  if (req.entries.empty()) {
    AppendEntriesRes res(true, current_term);
    Save(res, res_builder);
@@ -129,11 +140,9 @@ void RaftServer::Start() {
    return;
  }

  throw utils::NotYetImplemented("AppendEntriesRpc which is not a heartbeat");

  // [Raft paper 5.3]
  // "If a follower's log is inconsistent with the leader's, the
  // consistency check will fail in the next AppendEntries RPC."
  // consistency check will fail in the AppendEntries RPC."
  //
  // Consistency checking assures the Log Matching Property:
  // - If two entries in different logs have the same index and
@@ -141,24 +150,20 @@ void RaftServer::Start() {
  // - If two entries in different logs have the same index and term,
  //   then the logs are identical in all preceding entries.
  auto log = Log();
  if (log.size() < req.prev_log_index ||
      log[req.prev_log_index - 1].term != req.prev_log_term) {
  if (log.size() <= req.prev_log_index ||
      log[req.prev_log_index].term != req.prev_log_term) {
    AppendEntriesRes res(false, current_term);
    Save(res, res_builder);
    return;
  }

  // If existing entry conflicts with new one, we need to delete the
  // existing entry and all that follow it.
  if (log.size() > req.prev_log_index &&
      log[req.prev_log_index].term != req.entries[0].term)
    DeleteLogSuffix(req.prev_log_index + 1);

  AppendLogEntries(req.leader_commit, req.entries);
  AppendLogEntries(req.leader_commit, req.prev_log_index + 1, req.entries);
  AppendEntriesRes res(true, current_term);
  Save(res, res_builder);
});

// start threads

SetNextElectionTimePoint();
election_thread_ = std::thread(&RaftServer::ElectionThreadMain, this);
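Note: taken together, the follower-side AppendEntries handling reduces to: reject if the entry at `prev_log_index` is missing or has the wrong term, truncate any conflicting suffix, then append. A condensed sketch of that flow against a simplified in-memory log (0-indexed, as in the new code; names are hypothetical):

#include <cstdint>
#include <vector>

struct Entry { uint64_t term; };

// Simplified follower-side handling of AppendEntries (sketch only).
bool HandleAppendEntries(std::vector<Entry> &log, uint64_t prev_log_index,
                         uint64_t prev_log_term,
                         const std::vector<Entry> &entries) {
  // Consistency check [Raft paper 5.3]: the entry at prev_log_index must
  // exist and carry the term the leader expects.
  if (log.size() <= prev_log_index ||
      log[prev_log_index].term != prev_log_term)
    return false;
  uint64_t index = prev_log_index + 1;
  for (const auto &entry : entries) {
    // A term conflict invalidates the existing suffix.
    if (index < log.size() && log[index].term != entry.term)
      log.erase(log.begin() + index, log.end());
    if (index == log.size()) log.push_back(entry);
    ++index;
  }
  return true;
}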
@@ -166,6 +171,8 @@ void RaftServer::Start() {
    if (peer_id == server_id_) continue;
    peer_threads_.emplace_back(&RaftServer::PeerThreadMain, this, peer_id);
  }

  no_op_issuer_thread_ = std::thread(&RaftServer::NoOpIssuerThreadMain, this);
}

void RaftServer::Shutdown() {
@@ -175,6 +182,7 @@ void RaftServer::Shutdown() {

    state_changed_.notify_all();
    election_change_.notify_all();
    leader_changed_.notify_all();
  }

  for (auto &peer_thread : peer_threads_) {
@@ -182,6 +190,7 @@ void RaftServer::Shutdown() {
  }

  if (election_thread_.joinable()) election_thread_.join();
  if (no_op_issuer_thread_.joinable()) no_op_issuer_thread_.join();
}

uint64_t RaftServer::CurrentTerm() {
@@ -208,9 +217,26 @@ std::vector<LogEntry> RaftServer::Log() {
}

void RaftServer::AppendToLog(const tx::TransactionId &tx_id,
                             const std::vector<database::StateDelta> &log) {
                             const std::vector<database::StateDelta> &deltas) {
  std::unique_lock<std::mutex> lock(lock_);
  DCHECK(mode_ == Mode::LEADER)
      << "`AppendToLog` should only be called in LEADER mode";
  if (deltas.size() == 2) {
    DCHECK(deltas[0].type == database::StateDelta::Type::TRANSACTION_BEGIN &&
           deltas[1].type == database::StateDelta::Type::TRANSACTION_COMMIT)
        << "Transactions with two state deltas must be reads (start with BEGIN "
           "and end with COMMIT)";
    rlog_->set_replicated(tx_id);
    return;
  }
  auto log = Log();
  DCHECK(last_applied_ == log.size() - 1) << "Everything from the leader's log "
                                             "should be applied into our state "
                                             "machine";
  rlog_->set_active(tx_id);
  throw utils::NotYetImplemented("RaftServer replication");
  log.emplace_back(CurrentTerm(), deltas);
  ++last_applied_;
  disk_storage_.Put(kLogKey, SerializeLog(log));
}

void RaftServer::Emplace(const database::StateDelta &delta) {
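Note: the early return in `AppendToLog` is the read-query shortcut: a transaction whose only deltas are TRANSACTION_BEGIN and TRANSACTION_COMMIT changed nothing, so it is marked replicated immediately instead of occupying a Raft log entry. The filter in isolation, with simplified stand-in types:

#include <vector>

enum class DeltaType { TRANSACTION_BEGIN, TRANSACTION_COMMIT, OTHER };

// Read-only transactions produce exactly BEGIN + COMMIT and need no
// replication; everything else must go through the Raft log.
bool IsReadOnlyTransaction(const std::vector<DeltaType> &deltas) {
  return deltas.size() == 2 && deltas[0] == DeltaType::TRANSACTION_BEGIN &&
         deltas[1] == DeltaType::TRANSACTION_COMMIT;
}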
@@ -218,8 +244,6 @@ void RaftServer::Emplace(const database::StateDelta &delta) {
}

bool RaftServer::SafeToCommit(const tx::TransactionId &tx_id) {
  std::unique_lock<std::mutex> lock(lock_);

  switch (mode_) {
    case Mode::FOLLOWER:
      // When in follower mode, we will only try to apply a Raft Log when we
@@ -242,25 +266,29 @@ bool RaftServer::SafeToCommit(const tx::TransactionId &tx_id) {
  throw InvalidReplicationLogLookup();
}

void RaftServer::GarbageCollectReplicationLog(const tx::TransactionId &tx_id) {
  rlog_->garbage_collect_older(tx_id);
}

RaftServer::LogEntryBuffer::LogEntryBuffer(RaftServer *raft_server)
    : raft_server_(raft_server) {
  CHECK(raft_server_) << "RaftServer can't be nullptr";
}

void RaftServer::LogEntryBuffer::Enable() {
  std::lock_guard<std::mutex> guard(lock_);
  std::lock_guard<std::mutex> guard(buffer_lock_);
  enabled_ = true;
}

void RaftServer::LogEntryBuffer::Disable() {
  std::lock_guard<std::mutex> guard(lock_);
  std::lock_guard<std::mutex> guard(buffer_lock_);
  enabled_ = false;
  // Clear all existing logs from buffers.
  logs_.clear();
}

void RaftServer::LogEntryBuffer::Emplace(const database::StateDelta &delta) {
  std::lock_guard<std::mutex> guard(lock_);
  std::lock_guard<std::mutex> guard(buffer_lock_);
  if (!enabled_) return;

  tx::TransactionId tx_id = delta.transaction_id;
@@ -271,6 +299,7 @@ void RaftServer::LogEntryBuffer::Emplace(const database::StateDelta &delta) {
    std::vector<database::StateDelta> log(std::move(it->second));
    log.emplace_back(std::move(delta));
    logs_.erase(it);
    raft_server_->AppendToLog(tx_id, log);

    // Make sure that this wasn't a read query (contains transaction begin and
    // commit).
@@ -279,8 +308,6 @@ void RaftServer::LogEntryBuffer::Emplace(const database::StateDelta &delta) {
        << "Raft log of size two doesn't start with TRANSACTION_BEGIN";
    return;
  }

  raft_server_->AppendToLog(tx_id, log);
} else if (delta.type == database::StateDelta::Type::TRANSACTION_ABORT) {
  auto it = logs_.find(tx_id);
  CHECK(it != logs_.end()) << "Missing StateDeltas for transaction " << tx_id;
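Note: `LogEntryBuffer` accumulates deltas per transaction under `buffer_lock_` and hands a complete batch to `AppendToLog` once it sees the transaction-ending delta. A minimal sketch of that batching pattern (the real class additionally handles aborts and the enabled flag):

#include <mutex>
#include <unordered_map>
#include <utility>
#include <vector>

// Sketch: buffer deltas per transaction, flush the whole batch on commit.
template <typename TxId, typename Delta, typename FlushFn>
class TxBufferSketch {
 public:
  explicit TxBufferSketch(FlushFn flush) : flush_(std::move(flush)) {}

  void Emplace(TxId tx_id, Delta delta, bool ends_transaction) {
    std::lock_guard<std::mutex> guard(lock_);
    auto &batch = batches_[tx_id];
    batch.push_back(std::move(delta));
    if (ends_transaction) {
      flush_(tx_id, std::move(batch));  // Hand the full batch to the log.
      batches_.erase(tx_id);
    }
  }

 private:
  std::mutex lock_;
  std::unordered_map<TxId, std::vector<Delta>> batches_;
  FlushFn flush_;
};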
@@ -344,8 +371,6 @@ void RaftServer::Transition(const Mode &new_mode) {
    case Mode::LEADER: {
      LOG(INFO) << "Server " << server_id_
                << ": Transition to LEADER (Term: " << CurrentTerm() << ")";
      log_entry_buffer_.Enable();

      // Freeze election timer
      next_election_ = TimePoint::max();
      election_change_.notify_all();
@@ -355,12 +380,25 @@ void RaftServer::Transition(const Mode &new_mode) {
      for (auto &peer_heartbeat : next_heartbeat_)
        peer_heartbeat = now + config_.heartbeat_interval;

      mode_ = Mode::LEADER;
      // [Raft paper figure 2]
      // "For each server, index of the next log entry to send to that server
      // is initialized to leader's last log index + 1"
      for (int i = 1; i < coordination_->WorkerCount() + 1; ++i) {
        next_index_[i] = Log().size();
        match_index_[i] = 0;
      }

      // no_op_create_callback_ will create a new transaction that has a NO_OP
      // StateDelta. This will trigger the whole procedure of replicating logs
      // in our implementation of Raft.
      no_op_create_callback_();
      // Raft guarantees the Leader Append-Only property [Raft paper 5.2]
      // so it's safe to apply everything from our log into our state machine
      auto log = Log();
      for (int i = last_applied_ + 1; i < log.size(); ++i)
        delta_applier_->Apply(log[i].deltas);
      last_applied_ = log.size() - 1;

      mode_ = Mode::LEADER;
      log_entry_buffer_.Enable();

      leader_changed_.notify_all();
      break;
    }
  }
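Note: on the Follower-to-Leader transition the new code follows [Raft paper figure 2]: `next_index_` starts at the leader's last log index + 1 (here `Log().size()`, since the log is 0-indexed), `match_index_` resets to 0, and the already-replicated prefix is applied locally before the buffer is re-enabled. In outline (a sketch; names mirror the diff, the cluster iteration is simplified):

#include <cstdint>
#include <vector>

// Outline of leader-state initialization on election.
void BecomeLeaderSketch(std::vector<uint64_t> &next_index,
                        std::vector<uint64_t> &match_index,
                        uint64_t log_size, int cluster_size) {
  for (int i = 1; i < cluster_size; ++i) {
    next_index[i] = log_size;  // Last log index + 1 [Raft paper figure 2].
    match_index[i] = 0;        // Nothing is known to be replicated yet.
  }
}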
@@ -371,8 +409,130 @@ void RaftServer::UpdateTerm(uint64_t new_term) {
  disk_storage_.Delete(kVotedForKey);
}

void RaftServer::AdvanceCommitIndex() {
  DCHECK(mode_ == Mode::LEADER)
      << "Commit index can only be advanced by the leader";

  std::vector<uint64_t> known_replication_indices;
  for (int i = 1; i < coordination_->WorkerCount() + 1; ++i) {
    if (i != server_id_)
      known_replication_indices.push_back(match_index_[i]);
    else
      known_replication_indices.push_back(Log().size() - 1);
  }

  std::sort(known_replication_indices.begin(), known_replication_indices.end());
  uint64_t new_commit_index =
      known_replication_indices[(coordination_->WorkerCount() - 1) / 2];

  // This can happen because we reset the `match_index` vector to 0 after a
  // new leader has been elected.
  if (commit_index_ >= new_commit_index) return;

  // [Raft thesis, section 3.6.2]
  // "(...) Raft never commits log entries from previous terms by counting
  // replicas. Only log entries from the leader's current term are committed by
  // counting replicas; once an entry from the current term has been committed
  // in this way, then all prior entries are committed indirectly because of the
  // Log Matching Property."
  if (Log()[new_commit_index].term != CurrentTerm()) {
    LOG(INFO) << "Server " << server_id_ << ": cannot commit log entry from "
                                            "previous term based on "
                                            "replication count.";
    return;
  }

  LOG(INFO) << "Begin noting committed transactions";

  // Note the newly committed transactions in ReplicationLog
  std::set<tx::TransactionId> replicated_tx_ids;
  auto log = Log();
  for (int i = commit_index_ + 1; i <= new_commit_index; ++i) {
    for (const auto &state_delta : log[i].deltas) {
      replicated_tx_ids.insert(state_delta.transaction_id);
    }
  }

  for (const auto &tx_id : replicated_tx_ids)
    rlog_->set_replicated(tx_id);

  commit_index_ = new_commit_index;
}

void RaftServer::Recover() {
  throw utils::NotYetImplemented("RaftServer recover");
  throw utils::NotYetImplemented("RaftServer Recover");
}
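Note: `AdvanceCommitIndex` computes the highest index replicated on a majority by sorting the known replication indices and taking the lower median. For example, with indices {7, 5, 9, 5, 8} across five servers, sorting gives {5, 5, 7, 8, 9} and the lower median 7 is replicated on at least 3 of the 5. A simplified sketch, assuming one index per server with the leader's own last index included:

#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch: the highest index replicated on a majority is the lower median
// of the per-server replication indices (leader's own index included).
uint64_t MajorityReplicatedIndex(std::vector<uint64_t> indices) {
  std::sort(indices.begin(), indices.end());
  return indices[(indices.size() - 1) / 2];
}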
void RaftServer::SendEntries(uint16_t peer_id,
                             std::unique_lock<std::mutex> &lock) {
  uint64_t request_term = CurrentTerm();
  uint64_t request_prev_log_index = next_index_[peer_id] - 1;
  uint64_t request_prev_log_term = Log()[next_index_[peer_id] - 1].term;

  std::vector<raft::LogEntry> request_entries;
  if (next_index_[peer_id] <= Log().size() - 1)
    GetLogSuffix(next_index_[peer_id], request_entries);

  bool unreachable_peer = false;
  auto peer_future = coordination_->ExecuteOnWorker<AppendEntriesRes>(
      peer_id, [&](int worker_id, auto &client) {
        try {
          auto res = client.template Call<AppendEntriesRpc>(
              server_id_, commit_index_, request_term, request_prev_log_index,
              request_prev_log_term, request_entries);
          return res;
        } catch (...) {
          // Not being able to connect to the peer means we need to retry.
          // TODO(ipaljak): Consider backoff.
          unreachable_peer = true;
          return AppendEntriesRes(false, request_term);
        }
      });

  LOG(INFO) << "Entries size: " << request_entries.size();

  lock.unlock();  // Release lock while waiting for response.
  auto reply = peer_future.get();
  lock.lock();

  if (unreachable_peer) {
    next_heartbeat_[peer_id] = Clock::now() + config_.heartbeat_interval;
    return;
  }

  if (CurrentTerm() != request_term || exiting_) {
    return;
  }

  if (OutOfSync(reply.term)) {
    state_changed_.notify_all();
    return;
  }

  DCHECK(mode_ == Mode::LEADER)
      << "Elected leader for term should never change.";

  if (reply.term != CurrentTerm()) {
    LOG(INFO) << "Server " << server_id_
              << ": Ignoring stale AppendEntriesRPC reply from " << peer_id;
    return;
  }

  if (!reply.success) {
    DCHECK(next_index_[peer_id] > 1)
        << "Log replication should not fail for the first log entry";
    --next_index_[peer_id];
  } else {
    uint64_t new_match_index = request_prev_log_index + request_entries.size();
    DCHECK(match_index_[peer_id] <= new_match_index)
        << "`match_index` should increase monotonically within a term";
    match_index_[peer_id] = new_match_index;
    if (request_entries.size() > 0) AdvanceCommitIndex();
    next_index_[peer_id] = match_index_[peer_id] + 1;
    next_heartbeat_[peer_id] = Clock::now() + config_.heartbeat_interval;
  }

  state_changed_.notify_all();
}
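Note: when a follower rejects AppendEntries, the leader walks `next_index_` back one entry at a time until the consistency check passes; on success, `match_index_` jumps to `prev_log_index + entries.size()` and `next_index_` follows it. A sketch of that retry bookkeeping (field names mirror the diff; the RPC plumbing is elided):

#include <cstdint>

// Sketch of the leader's per-peer bookkeeping after an AppendEntries reply.
void OnAppendEntriesReply(bool success, uint64_t prev_log_index,
                          uint64_t sent_entries, uint64_t &next_index,
                          uint64_t &match_index) {
  if (!success) {
    // Consistency check failed: probe one entry earlier on the next try.
    --next_index;
    return;
  }
  match_index = prev_log_index + sent_entries;
  next_index = match_index + 1;
}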
void RaftServer::ElectionThreadMain() {
@@ -389,7 +549,7 @@ void RaftServer::ElectionThreadMain() {
  }
}

void RaftServer::PeerThreadMain(int peer_id) {
void RaftServer::PeerThreadMain(uint16_t peer_id) {
  std::unique_lock<std::mutex> lock(lock_);

  /* This loop will either call a function that issues an RPC or wait on the
@@ -436,6 +596,7 @@ void RaftServer::PeerThreadMain(int peer_id) {
        return RequestVoteRes(false, request_term);
      }
    });

    lock.unlock();  // Release lock while waiting for response
    auto reply = peer_future.get();
    lock.lock();
@@ -470,21 +631,10 @@ void RaftServer::PeerThreadMain(int peer_id) {

      case Mode::LEADER: {
        if (now >= next_heartbeat_[peer_id]) {
          LOG(INFO) << "Server " << server_id_ << ": Send HB to server "
                    << peer_id << " (Term: " << CurrentTerm() << ")";
          auto peer_future = coordination_->ExecuteOnWorker<AppendEntriesRes>(
              peer_id, [&](int worker_id, auto &client) {
                auto last_entry_data = LastEntryData();
                std::vector<raft::LogEntry> empty_entries;
                auto res = client.template Call<AppendEntriesRpc>(
                    server_id_, commit_index_, CurrentTerm(),
                    last_entry_data.first, last_entry_data.second,
                    empty_entries);
                return res;
              });
          next_heartbeat_[peer_id] =
              Clock::now() + config_.heartbeat_interval;
          state_changed_.notify_all();
          LOG(INFO) << "Server " << server_id_
                    << ": Send AppendEntries RPC to server " << peer_id
                    << " (Term: " << CurrentTerm() << ")";
          SendEntries(peer_id, lock);
          continue;
        }
        wait_until = next_heartbeat_[peer_id];
@@ -497,6 +647,19 @@ void RaftServer::PeerThreadMain(int peer_id) {
  }
}

void RaftServer::NoOpIssuerThreadMain() {
  std::mutex m;
  auto lock = std::unique_lock<std::mutex>(m);
  while (!exiting_) {
    leader_changed_.wait(lock);
    // no_op_create_callback_ will create a new transaction that has a NO_OP
    // StateDelta. This will trigger the whole procedure of replicating logs
    // in our implementation of Raft.
    if (!exiting_) no_op_create_callback_();
  }
}

void RaftServer::SetNextElectionTimePoint() {
  // [Raft thesis, section 3.4]
  // "Raft uses randomized election timeouts to ensure that split votes are
@@ -555,14 +718,32 @@ bool RaftServer::OutOfSync(uint64_t reply_term) {

void RaftServer::DeleteLogSuffix(int starting_index) {
  auto log = Log();
  log.erase(log.begin() + starting_index - 1, log.end());  // 1-indexed
  log.erase(log.begin() + starting_index, log.end());
  disk_storage_.Put(kLogKey, SerializeLog(log));
}

void RaftServer::GetLogSuffix(int starting_index,
                              std::vector<raft::LogEntry> &entries) {
  auto log = Log();
  for (int i = starting_index; i < log.size(); ++i) entries.push_back(log[i]);
}

void RaftServer::AppendLogEntries(uint64_t leader_commit_index,
                                  uint64_t starting_index,
                                  const std::vector<LogEntry> &new_entries) {
  auto log = Log();
  for (auto &entry : new_entries) log.emplace_back(entry);
  for (int i = 0; i < new_entries.size(); ++i) {
    // If an existing entry conflicts with a new one, we need to delete the
    // existing entry and all that follow it.
    int current_index = i + starting_index;
    if (log.size() > current_index &&
        log[current_index].term != new_entries[i].term)
      DeleteLogSuffix(current_index);
    DCHECK(log.size() >= current_index);
    if (log.size() == current_index) log.emplace_back(new_entries[i]);
  }

  // See Raft paper 5.3
  if (leader_commit_index > commit_index_) {
    commit_index_ = std::min(leader_commit_index, log.size() - 1);
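Note: a worked example of the conflict handling above. Suppose the follower's log terms are [0, 1, 1, 2] and the leader sends entries with terms [3, 3] at `starting_index` 2. The entry at index 2 has term 1 ≠ 3, so the suffix from index 2 is deleted and both new entries are appended, yielding [0, 1, 3, 3]. As a self-contained check against a simplified, terms-only model:

#include <cassert>
#include <cstdint>
#include <vector>

// Simplified model of AppendLogEntries' conflict handling (terms only).
void AppendEntriesModel(std::vector<uint64_t> &log, uint64_t starting_index,
                        const std::vector<uint64_t> &new_terms) {
  for (size_t i = 0; i < new_terms.size(); ++i) {
    size_t current = starting_index + i;
    if (current < log.size() && log[current] != new_terms[i])
      log.erase(log.begin() + current, log.end());  // Drop conflicting suffix.
    if (current == log.size()) log.push_back(new_terms[i]);
  }
}

int main() {
  std::vector<uint64_t> log = {0, 1, 1, 2};
  AppendEntriesModel(log, 2, {3, 3});
  assert((log == std::vector<uint64_t>{0, 1, 3, 3}));
}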
@@ -571,25 +752,26 @@ void RaftServer::AppendLogEntries(uint64_t leader_commit_index,
}

std::string RaftServer::SerializeLog(const std::vector<LogEntry> &log) {
  ::capnp::MallocMessageBuilder message;
  auto log_builder =
      message.initRoot<::capnp::List<capnp::LogEntry>>(log.size());
  utils::SaveVector<capnp::LogEntry, LogEntry>(
      log, &log_builder, [](auto *log_builder, const auto &log_entry) {
        Save(log_entry, log_builder);
      });

  std::stringstream stream(std::ios_base::in | std::ios_base::out |
                           std::ios_base::binary);
  kj::std::StdOutputStream std_stream(stream);
  kj::BufferedOutputStreamWrapper buffered_stream(std_stream);
  writeMessage(buffered_stream, message);
  {
    ::capnp::MallocMessageBuilder message;
    ::capnp::List<capnp::LogEntry>::Builder log_builder =
        message.initRoot<::capnp::List<capnp::LogEntry>>(log.size());
    utils::SaveVector<capnp::LogEntry, LogEntry>(
        log, &log_builder, [](auto *log_builder, const auto &log_entry) {
          Save(log_entry, log_builder);
        });

    kj::std::StdOutputStream std_stream(stream);
    kj::BufferedOutputStreamWrapper buffered_stream(std_stream);
    writeMessage(buffered_stream, message);
  }
  return stream.str();
}
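Note on the design change in `SerializeLog`: the message builder and stream wrappers now live in a nested scope, so `kj::BufferedOutputStreamWrapper` is destroyed (and therefore flushed) before `stream.str()` is read; in the old code the buffered bytes could still be sitting in the wrapper when the string was extracted. The same flush-on-destruction idiom, in a library-free sketch:

#include <ostream>
#include <sstream>
#include <string>

// Sketch: a buffered writer that only flushes to the sink on destruction,
// mimicking kj::BufferedOutputStreamWrapper's behavior.
class BufferedWriter {
 public:
  explicit BufferedWriter(std::ostream &sink) : sink_(sink) {}
  ~BufferedWriter() { sink_ << buffer_; }  // Flush on destruction.
  void Write(const std::string &data) { buffer_ += data; }

 private:
  std::ostream &sink_;
  std::string buffer_;
};

std::string Serialize() {
  std::stringstream stream;
  {
    BufferedWriter writer(stream);
    writer.Write("payload");
  }  // Writer flushed here, before the stream is read.
  return stream.str();
}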
std::vector<LogEntry> RaftServer::DeserializeLog(
    const std::string &serialized_log) {
  if (serialized_log.empty()) return {};
  ::capnp::MallocMessageBuilder message;
  std::stringstream stream(std::ios_base::in | std::ios_base::out |
                           std::ios_base::binary);
@@ -610,8 +792,9 @@ std::vector<LogEntry> RaftServer::DeserializeLog(
  return deserialized_log;
}

void RaftServer::GarbageCollectReplicationLog(const tx::TransactionId &tx_id) {
  rlog_->garbage_collect_older(tx_id);
void RaftServer::ResetReplicationLog() {
  rlog_ = nullptr;
  rlog_ = std::make_unique<ReplicationLog>();
}

}  // namespace raft
@@ -2,6 +2,7 @@

#pragma once

#include <atomic>
#include <mutex>
#include <unordered_map>
#include <vector>
@@ -51,7 +52,7 @@ class RaftServer final : public RaftInterface {
  /// @param durability_dir directory for persisted data.
  /// @param config raft configuration.
  /// @param coordination Abstraction for coordination between Raft servers.
  /// @param delta_applier TODO
  /// @param delta_applier Object which is able to apply state deltas to SM.
  /// @param reset_callback Function that is called on each Leader->Follower
  ///        transition.
  RaftServer(uint16_t server_id, const std::string &durability_dir,
@@ -81,10 +82,10 @@ class RaftServer final : public RaftInterface {
  /// persistent storage, an empty Log will be created.
  std::vector<LogEntry> Log();

  /// Append the log to the list of completed logs that are ready to be
  /// Append to the log a list of batched state deltas that are ready to be
  /// replicated.
  void AppendToLog(const tx::TransactionId &tx_id,
                   const std::vector<database::StateDelta> &log);
                   const std::vector<database::StateDelta> &deltas);

  /// Emplace a single StateDelta to the corresponding batch. If the StateDelta
  /// marks the transaction end, it will replicate the log across the cluster.
@@ -126,7 +127,7 @@ class RaftServer final : public RaftInterface {

 private:
  bool enabled_{false};
  mutable std::mutex lock_;
  mutable std::mutex buffer_lock_;
  std::unordered_map<tx::TransactionId, std::vector<database::StateDelta>>
      logs_;

@@ -144,7 +145,7 @@ class RaftServer final : public RaftInterface {
  database::StateDeltaApplier *delta_applier_{nullptr};
  std::unique_ptr<ReplicationLog> rlog_{nullptr};

  Mode mode_;               ///< Server's current mode.
  std::atomic<Mode> mode_;  ///< Server's current mode.
  uint16_t server_id_;      ///< ID of the current server.
  uint64_t commit_index_;   ///< Index of the highest known committed entry.
  uint64_t last_applied_;   ///< Index of the highest applied entry to SM.
@@ -162,6 +163,13 @@ class RaftServer final : public RaftInterface {
  std::condition_variable state_changed_;  ///< Notifies all peer threads on
                                           ///< relevant state change.

  std::thread no_op_issuer_thread_;  ///< Thread responsible for issuing no-op
                                     ///< command on leader change.

  std::condition_variable leader_changed_;  ///< Notifies the no_op_issuer_thread
                                            ///< that a new leader has been
                                            ///< elected.

  bool exiting_ = false;  ///< True on server shutdown.

  //////////////////////////////////////////////////////////////////////////////
@@ -187,10 +195,10 @@ class RaftServer final : public RaftInterface {
  // volatile state on leaders
  //////////////////////////////////////////////////////////////////////////////

  std::vector<uint16_t> next_index_;  ///< for each server, index of the next
  std::vector<uint64_t> next_index_;  ///< for each server, index of the next
                                      ///< log entry to send to that server.

  std::vector<uint16_t> match_index_;  ///< for each server, index of the
  std::vector<uint64_t> match_index_;  ///< for each server, index of the
                                       ///< highest log entry known to be
                                       ///< replicated on server.
@@ -225,10 +233,21 @@ class RaftServer final : public RaftInterface {
  /// Updates the current term.
  void UpdateTerm(uint64_t new_term);

  /// Tries to advance the commit index on a leader.
  void AdvanceCommitIndex();

  /// Recovers from persistent storage. This function should be called from
  /// the constructor before the server starts with normal operation.
  void Recover();

  /// Sends entries to a peer. This function should only be called in leader
  /// mode.
  ///
  /// @param peer_id ID of the peer which receives entries.
  /// @param lock Lock from the peer thread (released while waiting for
  ///        response)
  void SendEntries(uint16_t peer_id, std::unique_lock<std::mutex> &lock);

  /// Main function of the `election_thread_`. It is responsible for
  /// transition to CANDIDATE mode when election timeout elapses.
  void ElectionThreadMain();
@@ -237,7 +256,12 @@ class RaftServer final : public RaftInterface {
  /// specified node within the Raft cluster.
  ///
  /// @param peer_id - ID of a receiving node in the cluster.
  void PeerThreadMain(int peer_id);
  void PeerThreadMain(uint16_t peer_id);

  /// Issues a no-op command when a new leader is elected. This is done to
  /// force the Raft protocol to commit logs from previous terms that
  /// have been replicated on a majority of peers.
  void NoOpIssuerThreadMain();

  /// Sets the `TimePoint` for the next election.
  void SetNextElectionTimePoint();
@@ -278,14 +302,24 @@ class RaftServer final : public RaftInterface {
  /// 1-indexed.
  void DeleteLogSuffix(int starting_index);

  /// Stores log entries with indices greater than or equal to the given
  /// starting index into the provided container. If the starting index is
  /// greater than the log size, nothing will be stored in the provided
  /// container.
  ///
  /// @param starting_index Smallest index which will be stored.
  /// @param entries The container which will store the wanted suffix.
  void GetLogSuffix(int starting_index, std::vector<raft::LogEntry> &entries);

  /// Appends new log entries to the Raft log. Note that this function is not
  /// smart in any way, i.e. the caller should make sure that it's safe
  /// to call this function. This function also updates this server's commit
  /// index if necessary.
  ///
  /// @param leader_commit_index - Used to update local commit index.
  /// @param starting_index - Index in the log from which we start to append.
  /// @param new_entries - New `LogEntry` instances to be appended to the log.
  void AppendLogEntries(uint64_t leader_commit_index,
  void AppendLogEntries(uint64_t leader_commit_index, uint64_t starting_index,
                        const std::vector<LogEntry> &new_entries);

  /// Serializes Raft log into `std::string`.
@@ -294,9 +328,6 @@ class RaftServer final : public RaftInterface {
  /// Deserializes Raft log from `std::string`.
  std::vector<LogEntry> DeserializeLog(const std::string &serialized_log);

  void ResetReplicationLog() {
    rlog_ = nullptr;
    rlog_ = std::make_unique<ReplicationLog>();
  }
  void ResetReplicationLog();
};

}  // namespace raft