memgraph/src/raft/raft_server.hpp

375 lines
15 KiB
C++
Raw Normal View History

/// @file
#pragma once
#include <atomic>
#include <mutex>
#include <unordered_map>
#include <vector>
#include "database/single_node_ha/state_delta_applier.hpp"
#include "durability/single_node_ha/state_delta.hpp"
#include "raft/config.hpp"
#include "raft/coordination.hpp"
#include "raft/log_entry.hpp"
#include "raft/raft_interface.hpp"
#include "raft/raft_rpc_messages.hpp"
#include "raft/replication_log.hpp"
#include "storage/common/kvstore/kvstore.hpp"
#include "transactions/type.hpp"
#include "utils/scheduler.hpp"
// Forward declaration: this header only refers to `database::GraphDb` through
// pointers (constructor argument and the `db_` member), so the full class
// definition is not required here.
namespace database {
class GraphDb;
} // namespace database
namespace raft {
/// Clock on which the Raft time points in this header are based.
///
/// NOTE(review): `system_clock` is not monotonic (wall-clock adjustments can
/// move it backwards); if these aliases are only used for timeouts,
/// `steady_clock` may be the safer choice — confirm against the users.
using Clock = std::chrono::system_clock;
/// A point in time as measured by `Clock`.
using TimePoint = Clock::time_point;
/// Operating mode of a Raft server.
enum class Mode { FOLLOWER, CANDIDATE, LEADER };

/// Converts a `Mode` to its textual name.
///
/// @param mode Mode to convert.
/// @return "FOLLOWER", "CANDIDATE" or "LEADER" for the declared enumerators;
///         "UNKNOWN" for any other value (e.g. a corrupted or casted value).
inline std::string ModeToString(const Mode &mode) {
  switch (mode) {
    case Mode::FOLLOWER:
      return "FOLLOWER";
    case Mode::CANDIDATE:
      return "CANDIDATE";
    case Mode::LEADER:
      return "LEADER";
  }
  // Deliberately no `default:` label above, so the compiler keeps warning
  // when a new enumerator is added without a matching case. Getting here
  // means `mode` holds a value outside the declared enumerators; returning
  // explicitly avoids flowing off the end of a non-void function, which is
  // undefined behaviour.
  return "UNKNOWN";
}
/// Class which models the behaviour of a single server within the Raft
/// cluster. The class is responsible for storing both volatile and
/// persistent internal state of the corresponding state machine as well
/// as performing operations that comply with the Raft protocol.
///
/// The class is `final` and implements `RaftInterface`; `Emplace`,
/// `SafeToCommit` and `IsLeader` are the overridden entry points.
class RaftServer final : public RaftInterface {
public:
/// Default construction is disabled; use the constructor below.
RaftServer() = delete;
/// The implementation assumes that server IDs are unique integers
/// ranging from 1 to cluster_size.
///
/// @param server_id ID of the current server.
/// @param durability_dir directory for persisted data.
/// @param db_recover_on_startup flag indicating if recovery should happen at
/// startup.
/// @param config raft configuration.
/// @param coordination Abstraction for coordination between Raft servers.
/// @param delta_applier Object which is able to apply state deltas to SM.
/// @param db The current DB object.
RaftServer(uint16_t server_id, const std::string &durability_dir,
bool db_recover_on_startup, const Config &config,
raft::Coordination *coordination,
database::StateDeltaApplier *delta_applier, database::GraphDb *db);
/// Starts the RPC servers and starts mechanisms inside Raft protocol.
void Start();
/// Stops all threads responsible for the Raft protocol.
void Shutdown();
/// Retrieves the current term from persistent storage.
///
/// @throws MissingPersistentDataException
uint64_t CurrentTerm();
/// Retrieves the ID of the server this server has voted for in
/// the current term from persistent storage. Returns an empty optional
/// (std::experimental::nullopt) if such server doesn't exist.
std::experimental::optional<uint16_t> VotedFor();
/// Retrieves log size from persistent storage.
uint64_t LogSize();
/// Retrieves persisted snapshot metadata or nullopt if not present.
///
/// NOTE(review): judging by `PersistSnapshotMetadata`, the pair is presumably
/// (last_included_term, last_included_index) — confirm against the
/// definition.
std::experimental::optional<std::pair<uint64_t, uint64_t>>
GetSnapshotMetadata();
/// Persists snapshot metadata.
void PersistSnapshotMetadata(uint64_t last_included_term,
uint64_t last_included_index);
/// Append to the log a list of batched state deltas that are ready to be
/// replicated.
///
/// @param tx_id ID of the transaction (presumably the one the deltas belong
///              to — confirm against the definition).
/// @param deltas State deltas to append.
void AppendToLog(const tx::TransactionId &tx_id,
const std::vector<database::StateDelta> &deltas);
/// Emplace a single StateDelta to the corresponding batch. If the StateDelta
/// marks the transaction end, it will replicate the log across the cluster.
void Emplace(const database::StateDelta &delta) override;
/// Checks if the transaction with the given transaction id can safely be
/// committed in local storage.
bool SafeToCommit(const tx::TransactionId &tx_id) override;
/// Returns true if the current server's mode is LEADER. False otherwise.
bool IsLeader() override;
/// NOTE(review): undocumented in the original. Judging by the name, this
/// discards replication-log state kept for `tx_id` — confirm against the
/// definition.
void GarbageCollectReplicationLog(const tx::TransactionId &tx_id);
private:
/// Buffers incomplete Raft logs.
///
/// A Raft log is considered to be complete if it ends with a StateDelta
/// that represents transaction commit.
/// LogEntryBuffer will be used instead of WriteAheadLog. We don't need to
/// persist logs until we receive a majority vote from the Raft cluster, and
/// apply them to our local state machine (storage).
class LogEntryBuffer final {
public:
LogEntryBuffer() = delete;
/// @param raft_server Server this buffer belongs to (`RaftServer` passes
///                    `this` when constructing its `log_entry_buffer_`).
explicit LogEntryBuffer(RaftServer *raft_server);
/// Presumably the counterpart of `Disable`, i.e. allows insertions into the
/// buffer — confirm against the definition.
void Enable();
/// Disable all future insertions in the buffer.
///
/// Note: this will also clear all existing logs from buffers.
void Disable();
/// Insert a new StateDelta in logs.
///
/// If the StateDelta type is `TRANSACTION_COMMIT` it will start
/// replicating, and if the type is `TRANSACTION_ABORT` it will delete the
/// log from buffer.
void Emplace(const database::StateDelta &delta);
private:
/// The flag is `false` on construction.
bool enabled_{false};
/// NOTE(review): presumably guards `enabled_` and `logs_` — confirm.
mutable std::mutex buffer_lock_;
/// Batches of state deltas keyed by transaction ID.
std::unordered_map<tx::TransactionId, std::vector<database::StateDelta>>
logs_;
/// Raw pointer to a `RaftServer`; presumably the one handed to the
/// constructor — confirm.
RaftServer *raft_server_{nullptr};
};
mutable std::mutex lock_; ///< Guards all internal state.
//////////////////////////////////////////////////////////////////////////////
// volatile state on all servers
//////////////////////////////////////////////////////////////////////////////
// NOTE(review): several scalar members below (`server_id_`,
// `db_recover_on_startup_`, `commit_index_`, `last_applied_`,
// `granted_votes_`) have no default member initializer, so they must be set
// by the constructor or before their first read.
Config config_; ///< Raft config.
Coordination *coordination_{nullptr}; ///< Cluster coordination.
database::StateDeltaApplier *delta_applier_{nullptr};
database::GraphDb *db_{nullptr};
std::unique_ptr<ReplicationLog> rlog_{nullptr};
std::atomic<Mode> mode_; ///< Server's current mode.
uint16_t server_id_; ///< ID of the current server.
std::string durability_dir_; ///< Durability directory.
bool db_recover_on_startup_; ///< Flag indicating if recovery should happen
///< on startup.
uint64_t commit_index_; ///< Index of the highest known committed entry.
uint64_t last_applied_; ///< Index of the highest applied entry to SM.
/// Raft log entry buffer.
///
/// LogEntryBuffer buffers Raft logs until a log is complete and ready for
/// replication. This doesn't have to persist, if something fails before a
/// log is ready for replication it will be discarded anyway.
LogEntryBuffer log_entry_buffer_{this};
std::vector<std::thread> peer_threads_; ///< One thread per peer which
///< handles outgoing RPCs.
std::condition_variable state_changed_; ///< Notifies all peer threads on
///< relevant state change.
std::thread no_op_issuer_thread_; ///< Thread responsible for issuing no-op
///< command on leader change.
std::thread snapshot_thread_; ///< Thread responsible for snapshot creation
///< when log size reaches
///< `log_size_snapshot_threshold`.
std::condition_variable leader_changed_; ///< Notifies the
///< no_op_issuer_thread that a new
///< leader has been elected.
bool exiting_ = false; ///< True on server shutdown.
//////////////////////////////////////////////////////////////////////////////
// volatile state on followers and candidates
//////////////////////////////////////////////////////////////////////////////
std::thread election_thread_; ///< Timer thread for triggering elections.
TimePoint next_election_; ///< Next election `TimePoint`.
std::condition_variable election_change_; ///< Used to notify election_thread
///< on next_election_ change.
/// Random engine seeded from `std::random_device`; presumably used to
/// randomize election timeouts — confirm against the definition.
std::mt19937_64 rng_ = std::mt19937_64(std::random_device{}());
//////////////////////////////////////////////////////////////////////////////
// volatile state on candidates
//////////////////////////////////////////////////////////////////////////////
/// Number of votes granted to this server (presumably in the current
/// election — confirm).
uint16_t granted_votes_;
/// NOTE(review): presumably one flag per server telling whether a vote has
/// already been requested from it — confirm against the definition.
std::vector<bool> vote_requested_;
//////////////////////////////////////////////////////////////////////////////
// volatile state on leaders
//////////////////////////////////////////////////////////////////////////////
std::vector<uint64_t> next_index_; ///< for each server, index of the next
///< log entry to send to that server.
std::vector<uint64_t> match_index_; ///< for each server, index of the
///< highest log entry known to be
///< replicated on server.
std::vector<TimePoint> next_heartbeat_; ///< for each server, time point for
///< the next heartbeat.
std::vector<TimePoint> backoff_until_; ///< backoff for each server.
//////////////////////////////////////////////////////////////////////////////
// persistent state on all servers
//
// Persistent data consists of:
// - uint64_t current_term -- latest term server has seen.
// - uint16_t voted_for -- candidate_id that received vote in current
// term (null if none).
// - uint64_t log_size -- Number of stored entries within the log.
// - vector<LogEntry> log -- log entries. Each log entry is stored under
// a separate key within KVStore.
//////////////////////////////////////////////////////////////////////////////
/// Key-value store holding the persistent state described above.
storage::KVStore disk_storage_;
/// Makes a transition to a new `raft::Mode`.
///
/// @throws InvalidTransitionException when transitioning between
/// incompatible `raft::Mode`s.
void Transition(const raft::Mode &new_mode);
/// Updates the current term.
void UpdateTerm(uint64_t new_term);
/// Tries to advance the commit index on a leader.
void AdvanceCommitIndex();
/// Recovers from persistent storage. This function should be called from
/// the constructor before the server starts with normal operation.
void Recover();
/// Sends Entries to peer. This function should only be called in leader
/// mode.
///
/// @param peer_id ID of the peer which receives entries.
/// @param lock Lock from the peer thread (released while waiting for
/// response)
void SendEntries(uint16_t peer_id, std::unique_lock<std::mutex> &lock);
/// Main function of the `election_thread_`. It is responsible for
/// transition to CANDIDATE mode when election timeout elapses.
void ElectionThreadMain();
/// Main function of the thread that handles outgoing RPCs towards a
/// specified node within the Raft cluster.
///
/// @param peer_id - ID of a receiving node in the cluster.
void PeerThreadMain(uint16_t peer_id);
/// Issues no-op command when a new leader is elected. This is done to
/// force the Raft protocol to commit logs from previous terms that
/// have been replicated on a majority of peers.
void NoOpIssuerThreadMain();
/// Periodically checks if the Log size reached `log_size_snapshot_threshold`
/// parameter. If it has, then it performs log compaction and creates
/// snapshots.
void SnapshotThread();
/// Sets the `TimePoint` for next election.
void SetNextElectionTimePoint();
/// Checks if the current server obtained enough votes to become a leader.
///
/// NOTE(review): the name is misspelled ("Majorty"); renaming it requires
/// updating the definition and every caller together.
bool HasMajortyVote();
/// Returns relevant metadata about the last entry in this server's Raft Log.
/// More precisely, returns a pair consisting of an index of the last entry
/// in the log and the term of the last entry in the log.
///
/// @return std::pair<last_log_index, last_log_term>
std::pair<uint64_t, uint64_t> LastEntryData();
/// Checks whether Raft log of server A is at least as up-to-date as the Raft
/// log of server B. This is strictly defined in Raft paper 5.4.
///
/// @param last_log_index_a - Index of server A's last log entry.
/// @param last_log_term_a - Term of server A's last log entry.
/// @param last_log_index_b - Index of server B's last log entry.
/// @param last_log_term_b - Term of server B's last log entry.
bool AtLeastUpToDate(uint64_t last_log_index_a, uint64_t last_log_term_a,
uint64_t last_log_index_b, uint64_t last_log_term_b);
/// Checks whether the current server got a reply from "future", i.e. reply
/// with a higher term. If so, the current server falls back to follower mode
/// and updates its current term.
///
/// @param reply_term Term from RPC response.
/// @return true if the current server's term lags behind.
bool OutOfSync(uint64_t reply_term);
/// Retrieves a log entry from the log at a given index.
///
/// NOTE(review): the index is an `int` here (and in `DeleteLogSuffix` /
/// `GetLogSuffix`) while other log indices in this class are `uint64_t`
/// (e.g. `LogEntryKey`, `AppendLogEntries`); consider unifying the types.
///
/// @param index Index of the log entry to be retrieved.
LogEntry GetLogEntry(int index);
/// Deletes log entries with indexes that are greater or equal to the given
/// starting index.
///
/// @param starting_index Smallest index which will be deleted from the Log.
/// Also, a friendly reminder that log entries are
/// 1-indexed.
void DeleteLogSuffix(int starting_index);
/// Stores log entries with indexes that are greater or equal to the given
/// starting index into a provided container. If the starting index is
/// greater than the log size, nothing will be stored in the provided
/// container.
///
/// @param starting_index Smallest index which will be stored.
/// @param entries The container which will store the wanted suffix.
void GetLogSuffix(int starting_index, std::vector<raft::LogEntry> &entries);
/// Appends new log entries to Raft log. Note that this function is not
/// smart in any way, i.e. the caller should make sure that it's safe
/// to call this function. This function also updates this server's commit
/// index if necessary.
///
/// @param leader_commit_index - Used to update local commit index.
/// @param starting_index - Index in the log from which we start to append.
/// @param new_entries - New `LogEntry` instances to be appended in the log.
void AppendLogEntries(uint64_t leader_commit_index, uint64_t starting_index,
const std::vector<LogEntry> &new_entries);
/// Generates the key under which the `LogEntry` with a given index should
/// be stored on our disk storage.
///
/// @param index - Index of the `LogEntry` for which we generate the key.
std::string LogEntryKey(uint64_t index);
/// Serializes a Raft log entry into `std::string`.
std::string SerializeLogEntry(const LogEntry &log_entry);
/// Deserializes a Raft log entry from `std::string`.
LogEntry DeserializeLogEntry(const std::string &serialized_log_entry);
/// Resets the replication log used to indicate the replication status.
void ResetReplicationLog();
/// Recovers the latest snapshot that exists in the durability directory.
void RecoverSnapshot();
/// Start a new transaction with a NO-OP StateDelta.
void NoOpCreate();
};
} // namespace raft