2018-11-19 18:27:45 +08:00
|
|
|
/// @file
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
2019-01-04 22:27:41 +08:00
|
|
|
#include <atomic>
|
2018-11-19 23:46:30 +08:00
|
|
|
#include <mutex>
|
|
|
|
#include <unordered_map>
|
|
|
|
#include <vector>
|
2018-11-19 18:27:45 +08:00
|
|
|
|
2018-12-12 22:50:17 +08:00
|
|
|
#include "database/single_node_ha/state_delta_applier.hpp"
|
2018-11-19 23:46:30 +08:00
|
|
|
#include "durability/single_node_ha/state_delta.hpp"
|
|
|
|
#include "raft/config.hpp"
|
2018-12-07 23:19:03 +08:00
|
|
|
#include "raft/coordination.hpp"
|
|
|
|
#include "raft/log_entry.hpp"
|
2018-11-30 21:32:32 +08:00
|
|
|
#include "raft/raft_interface.hpp"
|
2018-12-12 22:50:17 +08:00
|
|
|
#include "raft/raft_rpc_messages.hpp"
|
2018-12-18 21:31:55 +08:00
|
|
|
#include "raft/replication_log.hpp"
|
2018-11-19 18:27:45 +08:00
|
|
|
#include "storage/common/kvstore/kvstore.hpp"
|
2018-11-19 23:46:30 +08:00
|
|
|
#include "transactions/type.hpp"
|
2018-12-07 23:19:03 +08:00
|
|
|
#include "utils/scheduler.hpp"
|
2018-11-19 18:27:45 +08:00
|
|
|
|
|
|
|
namespace raft {
|
|
|
|
|
2018-12-11 17:51:37 +08:00
|
|
|
/// Clock used for all Raft timing (election timeouts, heartbeats, backoff).
///
/// NOTE(review): `system_clock` is not monotonic; if these time points are only
/// used for timeouts, `steady_clock` may be the safer choice -- confirm against
/// the call sites before changing.
using Clock = std::chrono::system_clock;

/// A point in time as measured by `Clock`.
using TimePoint = Clock::time_point;
|
2018-11-21 22:49:04 +08:00
|
|
|
|
2018-11-19 18:27:45 +08:00
|
|
|
/// Roles a server can assume within the Raft protocol.
enum class Mode { FOLLOWER, CANDIDATE, LEADER };

/// Converts a `Mode` into its textual name.
///
/// @param mode Mode to convert.
/// @return "FOLLOWER", "CANDIDATE" or "LEADER" for the respective enumerator;
///         "UNKNOWN" if `mode` holds a value that matches no enumerator.
inline std::string ModeToString(const Mode &mode) {
  // Deliberately no `default:` label, so the compiler can still warn
  // (-Wswitch) when a new enumerator is added but not handled here.
  switch (mode) {
    case Mode::FOLLOWER:
      return "FOLLOWER";
    case Mode::CANDIDATE:
      return "CANDIDATE";
    case Mode::LEADER:
      return "LEADER";
  }
  // Reached only for an out-of-range value. Without this return the function
  // would flow off its end, which is undefined behaviour for a non-void
  // function.
  return "UNKNOWN";
}
|
|
|
|
|
2018-11-19 23:46:30 +08:00
|
|
|
/// Class which models the behaviour of a single server within the Raft
|
|
|
|
/// cluster. The class is responsible for storing both volatile and
|
|
|
|
/// persistent internal state of the corresponding state machine as well
|
|
|
|
/// as performing operations that comply with the Raft protocol.
|
2018-11-30 21:32:32 +08:00
|
|
|
class RaftServer final : public RaftInterface {
|
2018-11-19 18:27:45 +08:00
|
|
|
public:
|
|
|
|
RaftServer() = delete;
|
|
|
|
|
2018-12-07 23:19:03 +08:00
|
|
|
/// The implementation assumes that server IDs are unique integers between
|
|
|
|
/// ranging from 1 to cluster_size.
|
|
|
|
///
|
|
|
|
/// @param server_id ID of the current server.
|
|
|
|
/// @param durbility_dir directory for persisted data.
|
|
|
|
/// @param config raft configuration.
|
|
|
|
/// @param coordination Abstraction for coordination between Raft servers.
|
2019-01-04 22:27:41 +08:00
|
|
|
/// @param delta_applier Object which is able to apply state deltas to SM.
|
2018-11-30 21:32:32 +08:00
|
|
|
/// @param reset_callback Function that is called on each Leader->Follower
|
|
|
|
/// transition.
|
2018-11-21 22:49:04 +08:00
|
|
|
RaftServer(uint16_t server_id, const std::string &durability_dir,
|
2018-11-30 21:32:32 +08:00
|
|
|
const Config &config, raft::Coordination *coordination,
|
2018-12-12 22:50:17 +08:00
|
|
|
database::StateDeltaApplier *delta_applier,
|
2019-01-03 21:53:03 +08:00
|
|
|
std::function<void(void)> reset_callback,
|
|
|
|
std::function<void(void)> no_op_create);
|
2018-11-19 18:27:45 +08:00
|
|
|
|
2018-12-11 17:51:37 +08:00
|
|
|
/// Starts the RPC servers and starts mechanisms inside Raft protocol.
|
|
|
|
void Start();
|
|
|
|
|
|
|
|
/// Stops all threads responsible for the Raft protocol.
|
|
|
|
void Shutdown();
|
2018-12-07 23:19:03 +08:00
|
|
|
|
|
|
|
/// Retrieves the current term from persistent storage.
|
|
|
|
///
|
|
|
|
/// @throws MissingPersistentDataException
|
|
|
|
uint64_t CurrentTerm();
|
|
|
|
|
|
|
|
/// Retrieves the ID of the server this server has voted for in
|
|
|
|
/// the current term from persistent storage. Returns std::nullopt
|
|
|
|
/// if such server doesn't exist.
|
|
|
|
std::experimental::optional<uint16_t> VotedFor();
|
|
|
|
|
2019-01-15 21:11:18 +08:00
|
|
|
/// Retrieves log size from persistent storage.
|
|
|
|
uint64_t LogSize();
|
2018-12-07 23:19:03 +08:00
|
|
|
|
2019-01-04 22:27:41 +08:00
|
|
|
/// Append to the log a list of batched state deltasa that are ready to be
|
2018-12-18 21:31:55 +08:00
|
|
|
/// replicated.
|
|
|
|
void AppendToLog(const tx::TransactionId &tx_id,
|
2019-01-04 22:27:41 +08:00
|
|
|
const std::vector<database::StateDelta> &deltas);
|
2018-11-19 18:27:45 +08:00
|
|
|
|
2018-11-30 21:32:32 +08:00
|
|
|
/// Emplace a single StateDelta to the corresponding batch. If the StateDelta
|
|
|
|
/// marks the transaction end, it will replicate the log accorss the cluster.
|
|
|
|
void Emplace(const database::StateDelta &delta) override;
|
2018-11-19 18:27:45 +08:00
|
|
|
|
2018-12-18 21:31:55 +08:00
|
|
|
/// Checks if the transaction with the given transaction id can safely be
|
|
|
|
/// committed in local storage.
|
|
|
|
bool SafeToCommit(const tx::TransactionId &tx_id) override;
|
|
|
|
|
2019-01-04 23:07:57 +08:00
|
|
|
/// Returns true if the current servers mode is LEADER. False otherwise.
|
|
|
|
bool IsLeader() override;
|
|
|
|
|
2018-12-18 21:31:55 +08:00
|
|
|
void GarbageCollectReplicationLog(const tx::TransactionId &tx_id);
|
2018-12-12 22:50:17 +08:00
|
|
|
|
2018-11-19 23:46:30 +08:00
|
|
|
private:
|
|
|
|
/// Buffers incomplete Raft logs.
|
|
|
|
///
|
|
|
|
/// A Raft log is considered to be complete if it ends with a StateDelta
|
2019-01-03 21:53:03 +08:00
|
|
|
/// that represents transaction commit.
|
2018-11-19 23:46:30 +08:00
|
|
|
/// LogEntryBuffer will be used instead of WriteAheadLog. We don't need to
|
|
|
|
/// persist logs until we receive a majority vote from the Raft cluster, and
|
|
|
|
/// apply the to our local state machine(storage).
|
|
|
|
class LogEntryBuffer final {
|
|
|
|
public:
|
|
|
|
LogEntryBuffer() = delete;
|
|
|
|
|
|
|
|
explicit LogEntryBuffer(RaftServer *raft_server);
|
|
|
|
|
|
|
|
void Enable();
|
|
|
|
|
|
|
|
/// Disable all future insertions in the buffer.
|
|
|
|
///
|
|
|
|
/// Note: this will also clear all existing logs from buffers.
|
|
|
|
void Disable();
|
|
|
|
|
|
|
|
/// Insert a new StateDelta in logs.
|
|
|
|
///
|
2018-12-12 22:50:17 +08:00
|
|
|
/// If the StateDelta type is `TRANSACTION_COMMIT` it will start
|
|
|
|
/// replicating, and if the type is `TRANSACTION_ABORT` it will delete the
|
|
|
|
/// log from buffer.
|
2018-11-19 23:46:30 +08:00
|
|
|
void Emplace(const database::StateDelta &delta);
|
|
|
|
|
|
|
|
private:
|
|
|
|
bool enabled_{false};
|
2019-01-04 22:27:41 +08:00
|
|
|
mutable std::mutex buffer_lock_;
|
2018-11-19 23:46:30 +08:00
|
|
|
std::unordered_map<tx::TransactionId, std::vector<database::StateDelta>>
|
|
|
|
logs_;
|
|
|
|
|
|
|
|
RaftServer *raft_server_{nullptr};
|
|
|
|
};
|
|
|
|
|
2018-12-11 17:51:37 +08:00
|
|
|
mutable std::mutex lock_; ///< Guards all internal state.
|
2018-12-07 23:19:03 +08:00
|
|
|
|
2018-11-19 23:46:30 +08:00
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
// volatile state on all servers
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
Config config_; ///< Raft config.
|
|
|
|
Coordination *coordination_{nullptr}; ///< Cluster coordination.
|
2018-12-12 22:50:17 +08:00
|
|
|
database::StateDeltaApplier *delta_applier_{nullptr};
|
2018-12-18 21:31:55 +08:00
|
|
|
std::unique_ptr<ReplicationLog> rlog_{nullptr};
|
2018-11-19 23:46:30 +08:00
|
|
|
|
2019-01-04 23:07:57 +08:00
|
|
|
std::atomic<Mode> mode_; ///< Server's current mode.
|
|
|
|
uint16_t server_id_; ///< ID of the current server.
|
|
|
|
uint64_t commit_index_; ///< Index of the highest known committed entry.
|
|
|
|
uint64_t last_applied_; ///< Index of the highest applied entry to SM.
|
2018-11-19 18:27:45 +08:00
|
|
|
|
2018-11-19 23:46:30 +08:00
|
|
|
/// Raft log entry buffer.
|
|
|
|
///
|
|
|
|
/// LogEntryBuffer buffers Raft logs until a log is complete and ready for
|
|
|
|
/// replication. This doesn't have to persist, if something fails before a
|
|
|
|
/// log is ready for replication it will be discarded anyway.
|
|
|
|
LogEntryBuffer log_entry_buffer_{this};
|
|
|
|
|
2018-12-11 17:51:37 +08:00
|
|
|
std::vector<std::thread> peer_threads_; ///< One thread per peer which
|
|
|
|
///< handles outgoing RPCs.
|
2018-12-07 23:19:03 +08:00
|
|
|
|
2018-12-11 17:51:37 +08:00
|
|
|
std::condition_variable state_changed_; ///< Notifies all peer threads on
|
|
|
|
///< relevant state change.
|
2018-12-07 23:19:03 +08:00
|
|
|
|
2019-01-04 23:07:57 +08:00
|
|
|
std::thread no_op_issuer_thread_; ///< Thread responsible for issuing no-op
|
|
|
|
///< command on leader change.
|
2019-01-04 22:27:41 +08:00
|
|
|
|
2019-01-04 23:07:57 +08:00
|
|
|
std::condition_variable leader_changed_; ///< Notifies the
|
|
|
|
///< no_op_issuer_thread that a new
|
|
|
|
///< leader has been elected.
|
2019-01-04 22:27:41 +08:00
|
|
|
|
2018-12-11 17:51:37 +08:00
|
|
|
bool exiting_ = false; ///< True on server shutdown.
|
2018-12-07 23:19:03 +08:00
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
// volatile state on followers and candidates
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
std::thread election_thread_; ///< Timer thread for triggering elections.
|
|
|
|
TimePoint next_election_; ///< Next election `TimePoint`.
|
|
|
|
|
2018-12-11 17:51:37 +08:00
|
|
|
std::condition_variable election_change_; ///> Used to notify election_thread
|
|
|
|
///> on next_election_ change.
|
2018-12-07 23:19:03 +08:00
|
|
|
|
|
|
|
std::mt19937_64 rng_ = std::mt19937_64(std::random_device{}());
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
// volatile state on candidates
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
uint16_t granted_votes_;
|
|
|
|
std::vector<bool> vote_requested_;
|
|
|
|
|
2018-11-19 23:46:30 +08:00
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
// volatile state on leaders
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
2018-11-19 18:27:45 +08:00
|
|
|
|
2019-01-04 22:27:41 +08:00
|
|
|
std::vector<uint64_t> next_index_; ///< for each server, index of the next
|
2018-11-19 18:27:45 +08:00
|
|
|
///< log entry to send to that server.
|
|
|
|
|
2019-01-04 22:27:41 +08:00
|
|
|
std::vector<uint64_t> match_index_; ///< for each server, index of the
|
2018-11-19 18:27:45 +08:00
|
|
|
///< highest log entry known to be
|
|
|
|
///< replicated on server.
|
|
|
|
|
2018-12-07 23:19:03 +08:00
|
|
|
std::vector<TimePoint> next_heartbeat_; ///< for each server, time point for
|
|
|
|
///< the next heartbeat.
|
|
|
|
std::vector<TimePoint> backoff_until_; ///< backoff for each server.
|
|
|
|
|
2018-11-19 23:46:30 +08:00
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
// persistent state on all servers
|
|
|
|
//
|
|
|
|
// Persistent data consists of:
|
|
|
|
// - uint64_t current_term -- latest term server has seen.
|
|
|
|
// - uint16_t voted_for -- candidate_id that received vote in current
|
|
|
|
// term (null if none).
|
2019-01-15 21:11:18 +08:00
|
|
|
// - uint64_t log_size -- Number of stored entries within the log.
|
|
|
|
// - vector<LogEntry> log -- log entries. Each log entry is stored under
|
|
|
|
// a separate key within KVStore.
|
2018-11-19 23:46:30 +08:00
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
2018-12-07 23:19:03 +08:00
|
|
|
|
2018-11-19 18:27:45 +08:00
|
|
|
storage::KVStore disk_storage_;
|
|
|
|
|
2018-11-30 21:32:32 +08:00
|
|
|
/// Callback that needs to be called to reset the db state.
|
|
|
|
std::function<void(void)> reset_callback_;
|
|
|
|
|
2019-01-03 21:53:03 +08:00
|
|
|
/// Callback that creates a new transaction with NO_OP StateDelta.
|
|
|
|
std::function<void(void)> no_op_create_callback_;
|
|
|
|
|
2018-11-19 23:46:30 +08:00
|
|
|
/// Makes a transition to a new `raft::Mode`.
|
|
|
|
///
|
2018-12-07 23:19:03 +08:00
|
|
|
/// throws InvalidTransitionException when transitioning between incompatible
|
|
|
|
/// `raft::Mode`s.
|
|
|
|
void Transition(const raft::Mode &new_mode);
|
|
|
|
|
2018-12-10 23:04:14 +08:00
|
|
|
/// Updates the current term.
|
|
|
|
void UpdateTerm(uint64_t new_term);
|
|
|
|
|
2019-01-04 22:27:41 +08:00
|
|
|
/// Tries to advance the commit index on a leader.
|
|
|
|
void AdvanceCommitIndex();
|
|
|
|
|
2018-12-07 23:19:03 +08:00
|
|
|
/// Recovers from persistent storage. This function should be called from
|
|
|
|
/// the constructor before the server starts with normal operation.
|
|
|
|
void Recover();
|
|
|
|
|
2019-01-04 22:27:41 +08:00
|
|
|
/// Sends Entries to peer. This function should only be called in leader
|
|
|
|
/// mode.
|
|
|
|
///
|
|
|
|
/// @param peer_id ID of the peer which receives entries.
|
|
|
|
/// @param lock Lock from the peer thread (released while waiting for
|
|
|
|
/// response)
|
|
|
|
void SendEntries(uint16_t peer_id, std::unique_lock<std::mutex> &lock);
|
|
|
|
|
2018-12-07 23:19:03 +08:00
|
|
|
/// Main function of the `election_thread_`. It is responsible for
|
|
|
|
/// transition to CANDIDATE mode when election timeout elapses.
|
|
|
|
void ElectionThreadMain();
|
|
|
|
|
|
|
|
/// Main function of the thread that handles outgoing RPCs towards a
|
|
|
|
/// specified node within the Raft cluster.
|
|
|
|
///
|
|
|
|
/// @param peer_id - ID of a receiving node in the cluster.
|
2019-01-04 22:27:41 +08:00
|
|
|
void PeerThreadMain(uint16_t peer_id);
|
|
|
|
|
|
|
|
/// Issues no-op command when a new leader is elected. This is done to
|
|
|
|
/// force the Raft protocol to commit logs from previous terms that
|
|
|
|
/// have been replicated on a majority of peers.
|
|
|
|
void NoOpIssuerThreadMain();
|
2018-12-07 23:19:03 +08:00
|
|
|
|
|
|
|
/// Sets the `TimePoint` for next election.
|
|
|
|
void SetNextElectionTimePoint();
|
|
|
|
|
|
|
|
/// Checks if the current server obtained enough votes to become a leader.
|
|
|
|
bool HasMajortyVote();
|
|
|
|
|
|
|
|
/// Returns relevant metadata about the last entry in this server's Raft Log.
|
|
|
|
/// More precisely, returns a pair consisting of an index of the last entry
|
|
|
|
/// in the log and the term of the last entry in the log.
|
|
|
|
///
|
|
|
|
/// @return std::pair<last_log_index, last_log_term>
|
|
|
|
std::pair<uint64_t, uint64_t> LastEntryData();
|
|
|
|
|
|
|
|
/// Checks whether Raft log of server A is at least as up-to-date as the Raft
|
|
|
|
/// log of server B. This is strictly defined in Raft paper 5.4.
|
|
|
|
///
|
|
|
|
/// @param last_log_index_a - Index of server A's last log entry.
|
|
|
|
/// @param last_log_term_a - Term of server A's last log entry.
|
|
|
|
/// @param last_log_index_b - Index of server B's last log entry.
|
|
|
|
/// @param last_log_term_b - Term of server B's last log entry.
|
|
|
|
bool AtLeastUpToDate(uint64_t last_log_index_a, uint64_t last_log_term_a,
|
|
|
|
uint64_t last_log_index_b, uint64_t last_log_term_b);
|
|
|
|
|
|
|
|
/// Checks whether the current server got a reply from "future", i.e. reply
|
|
|
|
/// with a higher term. If so, the current server falls back to follower mode
|
|
|
|
/// and updates its current term.
|
|
|
|
///
|
|
|
|
/// @param reply_term Term from RPC response.
|
|
|
|
/// @return true if the current server's term lags behind.
|
|
|
|
bool OutOfSync(uint64_t reply_term);
|
|
|
|
|
2019-01-15 21:11:18 +08:00
|
|
|
/// Retrieves a log entry from the log at a given index.
|
|
|
|
///
|
|
|
|
/// @param index Index of the log entry to be retrieved.
|
|
|
|
LogEntry GetLogEntry(int index);
|
|
|
|
|
2018-12-07 23:19:03 +08:00
|
|
|
/// Deletes log entries with indexes that are greater or equal to the given
|
|
|
|
/// starting index.
|
|
|
|
///
|
|
|
|
/// @param starting_index Smallest index which will be deleted from the Log.
|
|
|
|
/// Also, a friendly remainder that log entries are
|
|
|
|
/// 1-indexed.
|
|
|
|
void DeleteLogSuffix(int starting_index);
|
|
|
|
|
2019-01-04 22:27:41 +08:00
|
|
|
/// Stores log entries with indexes that are greater or equal to the given
|
|
|
|
/// starting index into a provided container. If the starting index is
|
|
|
|
/// greater than the log size, nothing will be stored in the provided
|
|
|
|
/// container.
|
|
|
|
///
|
|
|
|
/// @param starting_index Smallest index which will be stored.
|
|
|
|
/// @param entries The container which will store the wanted suffix.
|
|
|
|
void GetLogSuffix(int starting_index, std::vector<raft::LogEntry> &entries);
|
|
|
|
|
2018-12-07 23:19:03 +08:00
|
|
|
/// Appends new log entries to Raft log. Note that this function is not
|
|
|
|
/// smart in any way, i.e. the caller should make sure that it's safe
|
|
|
|
/// to call this function. This function also updates this server's commit
|
|
|
|
/// index if necessary.
|
|
|
|
///
|
|
|
|
/// @param leader_commit_index - Used to update local commit index.
|
2019-01-04 22:27:41 +08:00
|
|
|
/// @param starting_index - Index in the log from which we start to append.
|
2018-12-07 23:19:03 +08:00
|
|
|
/// @param new_entries - New `LogEntry` instances to be appended in the log.
|
2019-01-04 22:27:41 +08:00
|
|
|
void AppendLogEntries(uint64_t leader_commit_index, uint64_t starting_index,
|
2018-12-07 23:19:03 +08:00
|
|
|
const std::vector<LogEntry> &new_entries);
|
|
|
|
|
2019-01-15 21:11:18 +08:00
|
|
|
/// Generates the key under which the `LogEntry` with a given index should
|
|
|
|
/// be stored on our disk storage.
|
|
|
|
///
|
|
|
|
/// @param index - Index of the `LogEntry` for which we generate the key.
|
|
|
|
std::string LogEntryKey(uint64_t index);
|
2018-12-07 23:19:03 +08:00
|
|
|
|
2019-01-15 21:11:18 +08:00
|
|
|
/// Serializes Raft log entry into `std::string`
|
|
|
|
std::string SerializeLogEntry(const LogEntry &log_entry);
|
2018-12-18 21:31:55 +08:00
|
|
|
|
2019-01-15 21:11:18 +08:00
|
|
|
/// Deserialized Raft log entry from `std::string`
|
|
|
|
LogEntry DeserializeLogEntry(const std::string &serialized_log_entry);
|
2019-01-04 22:27:41 +08:00
|
|
|
void ResetReplicationLog();
|
2018-11-19 18:27:45 +08:00
|
|
|
};
|
|
|
|
} // namespace raft
|