memgraph/src/coordination/raft_state.cpp
2024-03-21 09:12:28 +00:00

305 lines
11 KiB
C++

// Copyright 2024 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
// License, and you may not use this file except in compliance with the Business Source License.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
#ifdef MG_ENTERPRISE
#include <chrono>
#include <spdlog/spdlog.h>
#include "coordination/coordinator_communication_config.hpp"
#include "coordination/coordinator_exceptions.hpp"
#include "coordination/raft_state.hpp"
#include "utils/counter.hpp"
namespace memgraph::coordination {
using nuraft::asio_service;
using nuraft::cb_func;
using nuraft::CbReturnCode;
using nuraft::cmd_result;
using nuraft::cs_new;
using nuraft::ptr;
using nuraft::raft_params;
using nuraft::raft_server;
using nuraft::srv_config;
using raft_result = cmd_result<ptr<buffer>>;
RaftState::RaftState(BecomeLeaderCb become_leader_cb, BecomeFollowerCb become_follower_cb, uint32_t coordinator_id,
uint32_t raft_port, std::string raft_address)
: raft_endpoint_(raft_address, raft_port),
coordinator_id_(coordinator_id),
state_machine_(cs_new<CoordinatorStateMachine>()),
state_manager_(cs_new<CoordinatorStateManager>(coordinator_id_, raft_endpoint_.SocketAddress())),
logger_(nullptr),
become_leader_cb_(std::move(become_leader_cb)),
become_follower_cb_(std::move(become_follower_cb)) {}
auto RaftState::InitRaftServer() -> void {
asio_service::options asio_opts;
asio_opts.thread_pool_size_ = 1;
raft_params params;
params.heart_beat_interval_ = 100;
params.election_timeout_lower_bound_ = 200;
params.election_timeout_upper_bound_ = 400;
params.reserved_log_items_ = 5;
params.snapshot_distance_ = 5;
params.client_req_timeout_ = 3000;
params.return_method_ = raft_params::blocking;
// If the leader doesn't receive any response from quorum nodes
// in 200ms, it will step down.
// This allows us to achieve strong consistency even if network partition
// happens between the current leader and followers.
// The value must be <= election_timeout_lower_bound_ so that cluster can never
// have multiple leaders.
params.leadership_expiry_ = 200;
raft_server::init_options init_opts;
init_opts.raft_callback_ = [this](cb_func::Type event_type, cb_func::Param *param) -> nuraft::CbReturnCode {
if (event_type == cb_func::BecomeLeader) {
spdlog::info("Node {} became leader", param->leaderId);
become_leader_cb_();
} else if (event_type == cb_func::BecomeFollower) {
spdlog::info("Node {} became follower", param->myId);
become_follower_cb_();
}
return CbReturnCode::Ok;
};
raft_launcher launcher;
raft_server_ =
launcher.init(state_machine_, state_manager_, logger_, raft_endpoint_.port, asio_opts, params, init_opts);
if (!raft_server_) {
throw RaftServerStartException("Failed to launch raft server on {}", raft_endpoint_.SocketAddress());
}
auto maybe_stop = utils::ResettableCounter<20>();
do {
if (raft_server_->is_initialized()) {
return;
}
std::this_thread::sleep_for(std::chrono::milliseconds(250));
} while (!maybe_stop());
throw RaftServerStartException("Failed to initialize raft server on {}", raft_endpoint_.SocketAddress());
}
auto RaftState::MakeRaftState(BecomeLeaderCb &&become_leader_cb, BecomeFollowerCb &&become_follower_cb) -> RaftState {
uint32_t coordinator_id = FLAGS_coordinator_id;
uint32_t raft_port = FLAGS_coordinator_port;
auto raft_state =
RaftState(std::move(become_leader_cb), std::move(become_follower_cb), coordinator_id, raft_port, "127.0.0.1");
raft_state.InitRaftServer();
return raft_state;
}
RaftState::~RaftState() { launcher_.shutdown(); }
auto RaftState::InstanceName() const -> std::string {
return fmt::format("coordinator_{}", std::to_string(coordinator_id_));
}
auto RaftState::RaftSocketAddress() const -> std::string { return raft_endpoint_.SocketAddress(); }
auto RaftState::AddCoordinatorInstance(coordination::CoordinatorToCoordinatorConfig const &config) -> void {
auto const endpoint = config.coordinator_server.SocketAddress();
srv_config const srv_config_to_add(static_cast<int>(config.coordinator_server_id), endpoint);
auto cmd_result = raft_server_->add_srv(srv_config_to_add);
if (cmd_result->get_result_code() == nuraft::cmd_result_code::OK) {
spdlog::info("Request to add server {} to the cluster accepted", endpoint);
} else {
throw RaftAddServerException("Failed to accept request to add server {} to the cluster with error code {}",
endpoint, int(cmd_result->get_result_code()));
}
// Waiting for server to join
constexpr int max_tries{10};
auto maybe_stop = utils::ResettableCounter<max_tries>();
constexpr int waiting_period{200};
bool added{false};
while (!maybe_stop()) {
std::this_thread::sleep_for(std::chrono::milliseconds(waiting_period));
const auto server_config = raft_server_->get_srv_config(static_cast<nuraft::int32>(config.coordinator_server_id));
if (server_config) {
spdlog::trace("Server with id {} added to cluster", config.coordinator_server_id);
added = true;
break;
}
}
if (!added) {
throw RaftAddServerException("Failed to add server {} to the cluster in {}ms", endpoint,
max_tries * waiting_period);
}
}
auto RaftState::GetAllCoordinators() const -> std::vector<ptr<srv_config>> {
std::vector<ptr<srv_config>> all_srv_configs;
raft_server_->get_srv_config_all(all_srv_configs);
return all_srv_configs;
}
auto RaftState::IsLeader() const -> bool { return raft_server_->is_leader(); }
auto RaftState::RequestLeadership() -> bool { return raft_server_->is_leader() || raft_server_->request_leadership(); }
auto RaftState::AppendRegisterReplicationInstanceLog(CoordinatorToReplicaConfig const &config) -> bool {
auto new_log = CoordinatorStateMachine::SerializeRegisterInstance(config);
auto const res = raft_server_->append_entries({new_log});
if (!res->get_accepted()) {
spdlog::error(
"Failed to accept request for registering instance {}. Most likely the reason is that the instance is not "
"the "
"leader.",
config.instance_name);
return false;
}
spdlog::info("Request for registering instance {} accepted", config.instance_name);
if (res->get_result_code() != nuraft::cmd_result_code::OK) {
spdlog::error("Failed to register instance {} with error code {}", config.instance_name,
int(res->get_result_code()));
return false;
}
return true;
}
auto RaftState::AppendUnregisterReplicationInstanceLog(std::string_view instance_name) -> bool {
auto new_log = CoordinatorStateMachine::SerializeUnregisterInstance(instance_name);
auto const res = raft_server_->append_entries({new_log});
if (!res->get_accepted()) {
spdlog::error(
"Failed to accept request for unregistering instance {}. Most likely the reason is that the instance is not "
"the leader.",
instance_name);
return false;
}
spdlog::info("Request for unregistering instance {} accepted", instance_name);
if (res->get_result_code() != nuraft::cmd_result_code::OK) {
spdlog::error("Failed to unregister instance {} with error code {}", instance_name, int(res->get_result_code()));
return false;
}
return true;
}
auto RaftState::AppendSetInstanceAsMainLog(std::string_view instance_name) -> bool {
auto new_log = CoordinatorStateMachine::SerializeSetInstanceAsMain(instance_name);
auto const res = raft_server_->append_entries({new_log});
if (!res->get_accepted()) {
spdlog::error(
"Failed to accept request for promoting instance {}. Most likely the reason is that the instance is not "
"the leader.",
instance_name);
return false;
}
spdlog::info("Request for promoting instance {} accepted", instance_name);
if (res->get_result_code() != nuraft::cmd_result_code::OK) {
spdlog::error("Failed to promote instance {} with error code {}", instance_name, int(res->get_result_code()));
return false;
}
return true;
}
auto RaftState::AppendSetInstanceAsReplicaLog(std::string_view instance_name) -> bool {
auto new_log = CoordinatorStateMachine::SerializeSetInstanceAsReplica(instance_name);
auto const res = raft_server_->append_entries({new_log});
if (!res->get_accepted()) {
spdlog::error(
"Failed to accept request for demoting instance {}. Most likely the reason is that the instance is not "
"the leader.",
instance_name);
return false;
}
spdlog::info("Request for demoting instance {} accepted", instance_name);
if (res->get_result_code() != nuraft::cmd_result_code::OK) {
spdlog::error("Failed to promote instance {} with error code {}", instance_name, int(res->get_result_code()));
return false;
}
return true;
}
auto RaftState::AppendUpdateUUIDLog(utils::UUID const &uuid) -> bool {
auto new_log = CoordinatorStateMachine::SerializeUpdateUUID(uuid);
auto const res = raft_server_->append_entries({new_log});
if (!res->get_accepted()) {
spdlog::error(
"Failed to accept request for updating UUID. Most likely the reason is that the instance is not "
"the leader.");
return false;
}
spdlog::info("Request for updating UUID accepted");
if (res->get_result_code() != nuraft::cmd_result_code::OK) {
spdlog::error("Failed to update UUID with error code {}", int(res->get_result_code()));
return false;
}
return true;
}
auto RaftState::AppendAddCoordinatorInstanceLog(CoordinatorToCoordinatorConfig const &config) -> bool {
auto new_log = CoordinatorStateMachine::SerializeAddCoordinatorInstance(config);
auto const res = raft_server_->append_entries({new_log});
if (!res->get_accepted()) {
spdlog::error(
"Failed to accept request for adding coordinator instance {}. Most likely the reason is that the instance is "
"not the leader.",
config.coordinator_server_id);
return false;
}
spdlog::info("Request for adding coordinator instance {} accepted", config.coordinator_server_id);
if (res->get_result_code() != nuraft::cmd_result_code::OK) {
spdlog::error("Failed to add coordinator instance {} with error code {}", config.coordinator_server_id,
static_cast<int>(res->get_result_code()));
return false;
}
return true;
}
auto RaftState::MainExists() const -> bool { return state_machine_->MainExists(); }
auto RaftState::IsMain(std::string_view instance_name) const -> bool { return state_machine_->IsMain(instance_name); }
auto RaftState::IsReplica(std::string_view instance_name) const -> bool {
return state_machine_->IsReplica(instance_name);
}
auto RaftState::GetReplicationInstances() const -> std::vector<ReplicationInstanceState> {
return state_machine_->GetReplicationInstances();
}
auto RaftState::GetCoordinatorInstances() const -> std::vector<CoordinatorInstanceState> {
return state_machine_->GetCoordinatorInstances();
}
auto RaftState::GetUUID() const -> utils::UUID { return state_machine_->GetUUID(); }
} // namespace memgraph::coordination
#endif