HA: Polish flow for replicas from coordinator (#1711)

This commit is contained in:
Antonio Filipovic 2024-02-16 10:58:01 +01:00 committed by GitHub
parent 5f2e3f01d0
commit bfc756c092
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
27 changed files with 534 additions and 100 deletions

View File

@ -15,6 +15,7 @@ target_sources(mg-coordination
include/coordination/instance_status.hpp include/coordination/instance_status.hpp
include/coordination/replication_instance.hpp include/coordination/replication_instance.hpp
include/coordination/raft_state.hpp include/coordination/raft_state.hpp
include/coordination/rpc_errors.hpp
include/nuraft/coordinator_log_store.hpp include/nuraft/coordinator_log_store.hpp
include/nuraft/coordinator_state_machine.hpp include/nuraft/coordinator_state_machine.hpp

View File

@ -17,6 +17,7 @@
#include "coordination/coordinator_config.hpp" #include "coordination/coordinator_config.hpp"
#include "coordination/coordinator_rpc.hpp" #include "coordination/coordinator_rpc.hpp"
#include "replication_coordination_glue/messages.hpp" #include "replication_coordination_glue/messages.hpp"
#include "utils/result.hpp"
namespace memgraph::coordination { namespace memgraph::coordination {
@ -45,6 +46,10 @@ auto CoordinatorClient::InstanceDownTimeoutSec() const -> std::chrono::seconds {
return config_.instance_down_timeout_sec; return config_.instance_down_timeout_sec;
} }
auto CoordinatorClient::InstanceGetUUIDFrequencySec() const -> std::chrono::seconds {
return config_.instance_get_uuid_frequency_sec;
}
void CoordinatorClient::StartFrequentCheck() { void CoordinatorClient::StartFrequentCheck() {
if (instance_checker_.IsRunning()) { if (instance_checker_.IsRunning()) {
return; return;
@ -140,6 +145,18 @@ auto CoordinatorClient::SendUnregisterReplicaRpc(std::string const &instance_nam
return false; return false;
} }
auto CoordinatorClient::SendGetInstanceUUIDRpc() const
-> utils::BasicResult<GetInstanceUUIDError, std::optional<utils::UUID>> {
try {
auto stream{rpc_client_.Stream<GetInstanceUUIDRpc>()};
auto res = stream.AwaitResponse();
return res.uuid;
} catch (const rpc::RpcFailedException &) {
spdlog::error("RPC error occured while sending GetInstance UUID RPC");
return GetInstanceUUIDError::RPC_EXCEPTION;
}
}
auto CoordinatorClient::SendEnableWritingOnMainRpc() const -> bool { auto CoordinatorClient::SendEnableWritingOnMainRpc() const -> bool {
try { try {
auto stream{rpc_client_.Stream<EnableWritingOnMainRpc>()}; auto stream{rpc_client_.Stream<EnableWritingOnMainRpc>()};

View File

@ -51,6 +51,12 @@ void CoordinatorHandlers::Register(memgraph::coordination::CoordinatorServer &se
spdlog::info("Received EnableWritingOnMainRpc on coordinator server"); spdlog::info("Received EnableWritingOnMainRpc on coordinator server");
CoordinatorHandlers::EnableWritingOnMainHandler(replication_handler, req_reader, res_builder); CoordinatorHandlers::EnableWritingOnMainHandler(replication_handler, req_reader, res_builder);
}); });
server.Register<coordination::GetInstanceUUIDRpc>(
[&replication_handler](slk::Reader *req_reader, slk::Builder *res_builder) -> void {
spdlog::info("Received GetInstanceUUIDRpc on coordinator server");
CoordinatorHandlers::GetInstanceUUIDHandler(replication_handler, req_reader, res_builder);
});
} }
void CoordinatorHandlers::SwapMainUUIDHandler(replication::ReplicationHandler &replication_handler, void CoordinatorHandlers::SwapMainUUIDHandler(replication::ReplicationHandler &replication_handler,
@ -74,12 +80,6 @@ void CoordinatorHandlers::DemoteMainToReplicaHandler(replication::ReplicationHan
slk::Reader *req_reader, slk::Builder *res_builder) { slk::Reader *req_reader, slk::Builder *res_builder) {
spdlog::info("Executing DemoteMainToReplicaHandler"); spdlog::info("Executing DemoteMainToReplicaHandler");
if (!replication_handler.IsMain()) {
spdlog::error("Setting to replica must be performed on main.");
slk::Save(coordination::DemoteMainToReplicaRes{false}, res_builder);
return;
}
coordination::DemoteMainToReplicaReq req; coordination::DemoteMainToReplicaReq req;
slk::Load(&req, req_reader); slk::Load(&req, req_reader);
@ -89,11 +89,18 @@ void CoordinatorHandlers::DemoteMainToReplicaHandler(replication::ReplicationHan
if (!replication_handler.SetReplicationRoleReplica(clients_config, std::nullopt)) { if (!replication_handler.SetReplicationRoleReplica(clients_config, std::nullopt)) {
spdlog::error("Demoting main to replica failed!"); spdlog::error("Demoting main to replica failed!");
slk::Save(coordination::PromoteReplicaToMainRes{false}, res_builder); slk::Save(coordination::DemoteMainToReplicaRes{false}, res_builder);
return; return;
} }
slk::Save(coordination::PromoteReplicaToMainRes{true}, res_builder); slk::Save(coordination::DemoteMainToReplicaRes{true}, res_builder);
}
void CoordinatorHandlers::GetInstanceUUIDHandler(replication::ReplicationHandler &replication_handler,
slk::Reader * /*req_reader*/, slk::Builder *res_builder) {
spdlog::info("Executing GetInstanceUUIDHandler");
slk::Save(coordination::GetInstanceUUIDRes{replication_handler.GetReplicaUUID()}, res_builder);
} }
void CoordinatorHandlers::PromoteReplicaToMainHandler(replication::ReplicationHandler &replication_handler, void CoordinatorHandlers::PromoteReplicaToMainHandler(replication::ReplicationHandler &replication_handler,

View File

@ -48,9 +48,12 @@ CoordinatorInstance::CoordinatorInstance()
spdlog::trace("Instance {} performing replica successful callback", repl_instance_name); spdlog::trace("Instance {} performing replica successful callback", repl_instance_name);
auto &repl_instance = find_repl_instance(self, repl_instance_name); auto &repl_instance = find_repl_instance(self, repl_instance_name);
// We need to get replicas UUID from time to time to ensure replica is listening to correct main
// and that it didn't go down for less time than we could notice
// We need to get id of main replica is listening to
// and swap if necessary
if (!repl_instance.EnsureReplicaHasCorrectMainUUID(self->GetMainUUID())) { if (!repl_instance.EnsureReplicaHasCorrectMainUUID(self->GetMainUUID())) {
spdlog::error( spdlog::error("Failed to swap uuid for replica instance {} which is alive", repl_instance.InstanceName());
fmt::format("Failed to swap uuid for replica instance {} which is alive", repl_instance.InstanceName()));
return; return;
} }
@ -62,14 +65,6 @@ CoordinatorInstance::CoordinatorInstance()
spdlog::trace("Instance {} performing replica failure callback", repl_instance_name); spdlog::trace("Instance {} performing replica failure callback", repl_instance_name);
auto &repl_instance = find_repl_instance(self, repl_instance_name); auto &repl_instance = find_repl_instance(self, repl_instance_name);
repl_instance.OnFailPing(); repl_instance.OnFailPing();
// We need to restart main uuid from instance since it was "down" at least a second
// There is slight delay, if we choose to use isAlive, instance can be down and back up in less than
// our isAlive time difference, which would lead to instance setting UUID to nullopt and stopping accepting any
// incoming RPCs from valid main
// TODO(antoniofilipovic) this needs here more complex logic
// We need to get id of main replica is listening to on successful ping
// and swap it to correct uuid if it failed
repl_instance.ResetMainUUID();
}; };
main_succ_cb_ = [find_repl_instance](CoordinatorInstance *self, std::string_view repl_instance_name) -> void { main_succ_cb_ = [find_repl_instance](CoordinatorInstance *self, std::string_view repl_instance_name) -> void {

View File

@ -80,6 +80,23 @@ void EnableWritingOnMainReq::Save(EnableWritingOnMainReq const &self, memgraph::
void EnableWritingOnMainReq::Load(EnableWritingOnMainReq *self, memgraph::slk::Reader *reader) {} void EnableWritingOnMainReq::Load(EnableWritingOnMainReq *self, memgraph::slk::Reader *reader) {}
// GetInstanceUUID
void GetInstanceUUIDReq::Save(const GetInstanceUUIDReq &self, memgraph::slk::Builder *builder) {
memgraph::slk::Save(self, builder);
}
void GetInstanceUUIDReq::Load(GetInstanceUUIDReq *self, memgraph::slk::Reader *reader) {
memgraph::slk::Load(self, reader);
}
void GetInstanceUUIDRes::Save(const GetInstanceUUIDRes &self, memgraph::slk::Builder *builder) {
memgraph::slk::Save(self, builder);
}
void GetInstanceUUIDRes::Load(GetInstanceUUIDRes *self, memgraph::slk::Reader *reader) {
memgraph::slk::Load(self, reader);
}
} // namespace coordination } // namespace coordination
constexpr utils::TypeInfo coordination::PromoteReplicaToMainReq::kType{utils::TypeId::COORD_FAILOVER_REQ, constexpr utils::TypeInfo coordination::PromoteReplicaToMainReq::kType{utils::TypeId::COORD_FAILOVER_REQ,
@ -107,8 +124,16 @@ constexpr utils::TypeInfo coordination::EnableWritingOnMainReq::kType{utils::Typ
constexpr utils::TypeInfo coordination::EnableWritingOnMainRes::kType{utils::TypeId::COORD_ENABLE_WRITING_ON_MAIN_RES, constexpr utils::TypeInfo coordination::EnableWritingOnMainRes::kType{utils::TypeId::COORD_ENABLE_WRITING_ON_MAIN_RES,
"CoordEnableWritingOnMainRes", nullptr}; "CoordEnableWritingOnMainRes", nullptr};
constexpr utils::TypeInfo coordination::GetInstanceUUIDReq::kType{utils::TypeId::COORD_GET_UUID_REQ, "CoordGetUUIDReq",
nullptr};
constexpr utils::TypeInfo coordination::GetInstanceUUIDRes::kType{utils::TypeId::COORD_GET_UUID_RES, "CoordGetUUIDRes",
nullptr};
namespace slk { namespace slk {
// PromoteReplicaToMainRpc
void Save(const memgraph::coordination::PromoteReplicaToMainRes &self, memgraph::slk::Builder *builder) { void Save(const memgraph::coordination::PromoteReplicaToMainRes &self, memgraph::slk::Builder *builder) {
memgraph::slk::Save(self.success, builder); memgraph::slk::Save(self.success, builder);
} }
@ -127,6 +152,7 @@ void Load(memgraph::coordination::PromoteReplicaToMainReq *self, memgraph::slk::
memgraph::slk::Load(&self->replication_clients_info, reader); memgraph::slk::Load(&self->replication_clients_info, reader);
} }
// DemoteMainToReplicaRpc
void Save(const memgraph::coordination::DemoteMainToReplicaReq &self, memgraph::slk::Builder *builder) { void Save(const memgraph::coordination::DemoteMainToReplicaReq &self, memgraph::slk::Builder *builder) {
memgraph::slk::Save(self.replication_client_info, builder); memgraph::slk::Save(self.replication_client_info, builder);
} }
@ -143,6 +169,8 @@ void Load(memgraph::coordination::DemoteMainToReplicaRes *self, memgraph::slk::R
memgraph::slk::Load(&self->success, reader); memgraph::slk::Load(&self->success, reader);
} }
// UnregisterReplicaRpc
void Save(memgraph::coordination::UnregisterReplicaReq const &self, memgraph::slk::Builder *builder) { void Save(memgraph::coordination::UnregisterReplicaReq const &self, memgraph::slk::Builder *builder) {
memgraph::slk::Save(self.instance_name, builder); memgraph::slk::Save(self.instance_name, builder);
} }
@ -167,6 +195,24 @@ void Load(memgraph::coordination::EnableWritingOnMainRes *self, memgraph::slk::R
memgraph::slk::Load(&self->success, reader); memgraph::slk::Load(&self->success, reader);
} }
// GetInstanceUUIDRpc
void Save(const memgraph::coordination::GetInstanceUUIDReq & /*self*/, memgraph::slk::Builder * /*builder*/) {
/* nothing to serialize*/
}
void Load(memgraph::coordination::GetInstanceUUIDReq * /*self*/, memgraph::slk::Reader * /*reader*/) {
/* nothing to serialize*/
}
void Save(const memgraph::coordination::GetInstanceUUIDRes &self, memgraph::slk::Builder *builder) {
memgraph::slk::Save(self.uuid, builder);
}
void Load(memgraph::coordination::GetInstanceUUIDRes *self, memgraph::slk::Reader *reader) {
memgraph::slk::Load(&self->uuid, reader);
}
} // namespace slk } // namespace slk
} // namespace memgraph } // namespace memgraph

View File

@ -15,6 +15,8 @@
#include "coordination/coordinator_config.hpp" #include "coordination/coordinator_config.hpp"
#include "rpc/client.hpp" #include "rpc/client.hpp"
#include "rpc_errors.hpp"
#include "utils/result.hpp"
#include "utils/scheduler.hpp" #include "utils/scheduler.hpp"
#include "utils/uuid.hpp" #include "utils/uuid.hpp"
@ -46,7 +48,7 @@ class CoordinatorClient {
auto SocketAddress() const -> std::string; auto SocketAddress() const -> std::string;
[[nodiscard]] auto DemoteToReplica() const -> bool; [[nodiscard]] auto DemoteToReplica() const -> bool;
// TODO: (andi) Consistent naming
auto SendPromoteReplicaToMainRpc(const utils::UUID &uuid, ReplicationClientsInfo replication_clients_info) const auto SendPromoteReplicaToMainRpc(const utils::UUID &uuid, ReplicationClientsInfo replication_clients_info) const
-> bool; -> bool;
@ -56,6 +58,8 @@ class CoordinatorClient {
auto SendEnableWritingOnMainRpc() const -> bool; auto SendEnableWritingOnMainRpc() const -> bool;
auto SendGetInstanceUUIDRpc() const -> memgraph::utils::BasicResult<GetInstanceUUIDError, std::optional<utils::UUID>>;
auto ReplicationClientInfo() const -> ReplClientInfo; auto ReplicationClientInfo() const -> ReplClientInfo;
auto SetCallbacks(HealthCheckCallback succ_cb, HealthCheckCallback fail_cb) -> void; auto SetCallbacks(HealthCheckCallback succ_cb, HealthCheckCallback fail_cb) -> void;
@ -64,6 +68,8 @@ class CoordinatorClient {
auto InstanceDownTimeoutSec() const -> std::chrono::seconds; auto InstanceDownTimeoutSec() const -> std::chrono::seconds;
auto InstanceGetUUIDFrequencySec() const -> std::chrono::seconds;
friend bool operator==(CoordinatorClient const &first, CoordinatorClient const &second) { friend bool operator==(CoordinatorClient const &first, CoordinatorClient const &second) {
return first.config_ == second.config_; return first.config_ == second.config_;
} }
@ -71,7 +77,6 @@ class CoordinatorClient {
private: private:
utils::Scheduler instance_checker_; utils::Scheduler instance_checker_;
// TODO: (andi) Pimpl?
communication::ClientContext rpc_context_; communication::ClientContext rpc_context_;
mutable rpc::Client rpc_client_; mutable rpc::Client rpc_client_;

View File

@ -30,6 +30,7 @@ struct CoordinatorClientConfig {
uint16_t port{}; uint16_t port{};
std::chrono::seconds instance_health_check_frequency_sec{1}; std::chrono::seconds instance_health_check_frequency_sec{1};
std::chrono::seconds instance_down_timeout_sec{5}; std::chrono::seconds instance_down_timeout_sec{5};
std::chrono::seconds instance_get_uuid_frequency_sec{10};
auto SocketAddress() const -> std::string { return ip_address + ":" + std::to_string(port); } auto SocketAddress() const -> std::string { return ip_address + ":" + std::to_string(port); }

View File

@ -38,6 +38,9 @@ class CoordinatorHandlers {
slk::Builder *res_builder); slk::Builder *res_builder);
static void EnableWritingOnMainHandler(replication::ReplicationHandler &replication_handler, slk::Reader *req_reader, static void EnableWritingOnMainHandler(replication::ReplicationHandler &replication_handler, slk::Reader *req_reader,
slk::Builder *res_builder); slk::Builder *res_builder);
static void GetInstanceUUIDHandler(replication::ReplicationHandler &replication_handler, slk::Reader *req_reader,
slk::Builder *res_builder);
}; };
} // namespace memgraph::dbms } // namespace memgraph::dbms

View File

@ -136,6 +136,31 @@ struct EnableWritingOnMainRes {
using EnableWritingOnMainRpc = rpc::RequestResponse<EnableWritingOnMainReq, EnableWritingOnMainRes>; using EnableWritingOnMainRpc = rpc::RequestResponse<EnableWritingOnMainReq, EnableWritingOnMainRes>;
struct GetInstanceUUIDReq {
static const utils::TypeInfo kType;
static const utils::TypeInfo &GetTypeInfo() { return kType; }
static void Load(GetInstanceUUIDReq *self, memgraph::slk::Reader *reader);
static void Save(const GetInstanceUUIDReq &self, memgraph::slk::Builder *builder);
GetInstanceUUIDReq() = default;
};
struct GetInstanceUUIDRes {
static const utils::TypeInfo kType;
static const utils::TypeInfo &GetTypeInfo() { return kType; }
static void Load(GetInstanceUUIDRes *self, memgraph::slk::Reader *reader);
static void Save(const GetInstanceUUIDRes &self, memgraph::slk::Builder *builder);
explicit GetInstanceUUIDRes(std::optional<utils::UUID> uuid) : uuid(uuid) {}
GetInstanceUUIDRes() = default;
std::optional<utils::UUID> uuid;
};
using GetInstanceUUIDRpc = rpc::RequestResponse<GetInstanceUUIDReq, GetInstanceUUIDRes>;
} // namespace memgraph::coordination } // namespace memgraph::coordination
// SLK serialization declarations // SLK serialization declarations
@ -153,6 +178,11 @@ void Load(memgraph::coordination::DemoteMainToReplicaRes *self, memgraph::slk::R
void Save(const memgraph::coordination::DemoteMainToReplicaReq &self, memgraph::slk::Builder *builder); void Save(const memgraph::coordination::DemoteMainToReplicaReq &self, memgraph::slk::Builder *builder);
void Load(memgraph::coordination::DemoteMainToReplicaReq *self, memgraph::slk::Reader *reader); void Load(memgraph::coordination::DemoteMainToReplicaReq *self, memgraph::slk::Reader *reader);
// GetInstanceUUIDRpc
void Save(const memgraph::coordination::GetInstanceUUIDReq &self, memgraph::slk::Builder *builder);
void Load(memgraph::coordination::GetInstanceUUIDReq *self, memgraph::slk::Reader *reader);
void Save(const memgraph::coordination::GetInstanceUUIDRes &self, memgraph::slk::Builder *builder);
void Load(memgraph::coordination::GetInstanceUUIDRes *self, memgraph::slk::Reader *reader);
// UnregisterReplicaRpc // UnregisterReplicaRpc
void Save(memgraph::coordination::UnregisterReplicaRes const &self, memgraph::slk::Builder *builder); void Save(memgraph::coordination::UnregisterReplicaRes const &self, memgraph::slk::Builder *builder);
void Load(memgraph::coordination::UnregisterReplicaRes *self, memgraph::slk::Reader *reader); void Load(memgraph::coordination::UnregisterReplicaRes *self, memgraph::slk::Reader *reader);

View File

@ -18,6 +18,7 @@
#include "replication_coordination_glue/role.hpp" #include "replication_coordination_glue/role.hpp"
#include <libnuraft/nuraft.hxx> #include <libnuraft/nuraft.hxx>
#include "utils/result.hpp"
#include "utils/uuid.hpp" #include "utils/uuid.hpp"
namespace memgraph::coordination { namespace memgraph::coordination {
@ -37,6 +38,9 @@ class ReplicationInstance {
auto OnSuccessPing() -> void; auto OnSuccessPing() -> void;
auto OnFailPing() -> bool; auto OnFailPing() -> bool;
auto IsReadyForUUIDPing() -> bool;
void UpdateReplicaLastResponseUUID();
auto IsAlive() const -> bool; auto IsAlive() const -> bool;
@ -62,7 +66,8 @@ class ReplicationInstance {
auto SendSwapAndUpdateUUID(const utils::UUID &new_main_uuid) -> bool; auto SendSwapAndUpdateUUID(const utils::UUID &new_main_uuid) -> bool;
auto SendUnregisterReplicaRpc(std::string const &instance_name) -> bool; auto SendUnregisterReplicaRpc(std::string const &instance_name) -> bool;
// TODO: (andi) Inconsistent API
auto SendGetInstanceUUID() -> utils::BasicResult<coordination::GetInstanceUUIDError, std::optional<utils::UUID>>;
auto GetClient() -> CoordinatorClient &; auto GetClient() -> CoordinatorClient &;
auto EnableWritingOnMain() -> bool; auto EnableWritingOnMain() -> bool;
@ -76,6 +81,7 @@ class ReplicationInstance {
replication_coordination_glue::ReplicationRole replication_role_; replication_coordination_glue::ReplicationRole replication_role_;
std::chrono::system_clock::time_point last_response_time_{}; std::chrono::system_clock::time_point last_response_time_{};
bool is_alive_{false}; bool is_alive_{false};
std::chrono::system_clock::time_point last_check_of_uuid_{};
// for replica this is main uuid of current main // for replica this is main uuid of current main
// for "main" main this same as in CoordinatorData // for "main" main this same as in CoordinatorData

View File

@ -0,0 +1,14 @@
// Copyright 2024 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
// License, and you may not use this file except in compliance with the Business Source License.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
namespace memgraph::coordination {
enum class GetInstanceUUIDError { NO_RESPONSE, RPC_EXCEPTION };
} // namespace memgraph::coordination

View File

@ -14,6 +14,7 @@
#include "coordination/replication_instance.hpp" #include "coordination/replication_instance.hpp"
#include "replication_coordination_glue/handler.hpp" #include "replication_coordination_glue/handler.hpp"
#include "utils/result.hpp"
namespace memgraph::coordination { namespace memgraph::coordination {
@ -39,6 +40,11 @@ auto ReplicationInstance::OnFailPing() -> bool {
return is_alive_; return is_alive_;
} }
auto ReplicationInstance::IsReadyForUUIDPing() -> bool {
return std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now() - last_check_of_uuid_) >
client_.InstanceGetUUIDFrequencySec();
}
auto ReplicationInstance::InstanceName() const -> std::string { return client_.InstanceName(); } auto ReplicationInstance::InstanceName() const -> std::string { return client_.InstanceName(); }
auto ReplicationInstance::SocketAddress() const -> std::string { return client_.SocketAddress(); } auto ReplicationInstance::SocketAddress() const -> std::string { return client_.SocketAddress(); }
auto ReplicationInstance::IsAlive() const -> bool { return is_alive_; } auto ReplicationInstance::IsAlive() const -> bool { return is_alive_; }
@ -91,11 +97,21 @@ auto ReplicationInstance::ResetMainUUID() -> void { main_uuid_ = std::nullopt; }
auto ReplicationInstance::GetMainUUID() const -> std::optional<utils::UUID> const & { return main_uuid_; } auto ReplicationInstance::GetMainUUID() const -> std::optional<utils::UUID> const & { return main_uuid_; }
auto ReplicationInstance::EnsureReplicaHasCorrectMainUUID(utils::UUID const &curr_main_uuid) -> bool { auto ReplicationInstance::EnsureReplicaHasCorrectMainUUID(utils::UUID const &curr_main_uuid) -> bool {
if (!main_uuid_ || *main_uuid_ != curr_main_uuid) { if (!IsReadyForUUIDPing()) {
return SendSwapAndUpdateUUID(curr_main_uuid);
}
return true; return true;
} }
auto res = SendGetInstanceUUID();
if (res.HasError()) {
return false;
}
UpdateReplicaLastResponseUUID();
if (res.GetValue().has_value() && res.GetValue().value() == curr_main_uuid) {
return true;
}
return SendSwapAndUpdateUUID(curr_main_uuid);
}
auto ReplicationInstance::SendSwapAndUpdateUUID(const utils::UUID &new_main_uuid) -> bool { auto ReplicationInstance::SendSwapAndUpdateUUID(const utils::UUID &new_main_uuid) -> bool {
if (!replication_coordination_glue::SendSwapMainUUIDRpc(client_.RpcClient(), new_main_uuid)) { if (!replication_coordination_glue::SendSwapMainUUIDRpc(client_.RpcClient(), new_main_uuid)) {
@ -111,5 +127,12 @@ auto ReplicationInstance::SendUnregisterReplicaRpc(std::string const &instance_n
auto ReplicationInstance::EnableWritingOnMain() -> bool { return client_.SendEnableWritingOnMainRpc(); } auto ReplicationInstance::EnableWritingOnMain() -> bool { return client_.SendEnableWritingOnMainRpc(); }
auto ReplicationInstance::SendGetInstanceUUID()
-> utils::BasicResult<coordination::GetInstanceUUIDError, std::optional<utils::UUID>> {
return client_.SendGetInstanceUUIDRpc();
}
void ReplicationInstance::UpdateReplicaLastResponseUUID() { last_check_of_uuid_ = std::chrono::system_clock::now(); }
} // namespace memgraph::coordination } // namespace memgraph::coordination
#endif #endif

View File

@ -22,6 +22,8 @@ DEFINE_uint32(raft_server_id, 0, "Unique ID of the raft server.");
DEFINE_uint32(instance_down_timeout_sec, 5, "Time duration after which an instance is considered down."); DEFINE_uint32(instance_down_timeout_sec, 5, "Time duration after which an instance is considered down.");
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_uint32(instance_health_check_frequency_sec, 1, "The time duration between two health checks/pings."); DEFINE_uint32(instance_health_check_frequency_sec, 1, "The time duration between two health checks/pings.");
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_uint32(instance_get_uuid_frequency_sec, 10, "The time duration between two instance uuid checks.");
#endif #endif
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)

View File

@ -24,6 +24,8 @@ DECLARE_uint32(raft_server_id);
DECLARE_uint32(instance_down_timeout_sec); DECLARE_uint32(instance_down_timeout_sec);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DECLARE_uint32(instance_health_check_frequency_sec); DECLARE_uint32(instance_health_check_frequency_sec);
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DECLARE_uint32(instance_get_uuid_frequency_sec);
#endif #endif
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)

View File

@ -359,6 +359,7 @@ int main(int argc, char **argv) {
#ifdef MG_ENTERPRISE #ifdef MG_ENTERPRISE
.instance_down_timeout_sec = std::chrono::seconds(FLAGS_instance_down_timeout_sec), .instance_down_timeout_sec = std::chrono::seconds(FLAGS_instance_down_timeout_sec),
.instance_health_check_frequency_sec = std::chrono::seconds(FLAGS_instance_health_check_frequency_sec), .instance_health_check_frequency_sec = std::chrono::seconds(FLAGS_instance_health_check_frequency_sec),
.instance_get_uuid_frequency_sec = std::chrono::seconds(FLAGS_instance_get_uuid_frequency_sec),
#endif #endif
.default_kafka_bootstrap_servers = FLAGS_kafka_bootstrap_servers, .default_kafka_bootstrap_servers = FLAGS_kafka_bootstrap_servers,
.default_pulsar_service_url = FLAGS_pulsar_service_url, .default_pulsar_service_url = FLAGS_pulsar_service_url,

View File

@ -24,6 +24,7 @@ struct InterpreterConfig {
std::chrono::seconds instance_down_timeout_sec{5}; std::chrono::seconds instance_down_timeout_sec{5};
std::chrono::seconds instance_health_check_frequency_sec{1}; std::chrono::seconds instance_health_check_frequency_sec{1};
std::chrono::seconds instance_get_uuid_frequency_sec{10};
std::string default_kafka_bootstrap_servers; std::string default_kafka_bootstrap_servers;
std::string default_pulsar_service_url; std::string default_pulsar_service_url;

View File

@ -329,7 +329,7 @@ class ReplQueryHandler {
.port = static_cast<uint16_t>(*port), .port = static_cast<uint16_t>(*port),
}; };
if (!handler_->SetReplicationRoleReplica(config, std::nullopt)) { if (!handler_->TrySetReplicationRoleReplica(config, std::nullopt)) {
throw QueryRuntimeException("Couldn't set role to replica!"); throw QueryRuntimeException("Couldn't set role to replica!");
} }
} }
@ -486,8 +486,9 @@ class CoordQueryHandler final : public query::CoordinatorQueryHandler {
void RegisterReplicationInstance(std::string const &coordinator_socket_address, void RegisterReplicationInstance(std::string const &coordinator_socket_address,
std::string const &replication_socket_address, std::string const &replication_socket_address,
std::chrono::seconds const &instance_check_frequency, std::chrono::seconds const &instance_check_frequency,
std::chrono::seconds const &instance_down_timeout, std::string const &instance_name, std::chrono::seconds const &instance_down_timeout,
CoordinatorQuery::SyncMode sync_mode) override { std::chrono::seconds const &instance_get_uuid_frequency,
std::string const &instance_name, CoordinatorQuery::SyncMode sync_mode) override {
const auto maybe_replication_ip_port = const auto maybe_replication_ip_port =
io::network::Endpoint::ParseSocketOrAddress(replication_socket_address, std::nullopt); io::network::Endpoint::ParseSocketOrAddress(replication_socket_address, std::nullopt);
if (!maybe_replication_ip_port) { if (!maybe_replication_ip_port) {
@ -514,6 +515,7 @@ class CoordQueryHandler final : public query::CoordinatorQueryHandler {
.port = coordinator_server_port, .port = coordinator_server_port,
.instance_health_check_frequency_sec = instance_check_frequency, .instance_health_check_frequency_sec = instance_check_frequency,
.instance_down_timeout_sec = instance_down_timeout, .instance_down_timeout_sec = instance_down_timeout,
.instance_get_uuid_frequency_sec = instance_get_uuid_frequency,
.replication_client_info = repl_config, .replication_client_info = repl_config,
.ssl = std::nullopt}; .ssl = std::nullopt};
@ -1185,11 +1187,12 @@ Callback HandleCoordinatorQuery(CoordinatorQuery *coordinator_query, const Param
instance_health_check_frequency_sec = config.instance_health_check_frequency_sec, instance_health_check_frequency_sec = config.instance_health_check_frequency_sec,
instance_name = coordinator_query->instance_name_, instance_name = coordinator_query->instance_name_,
instance_down_timeout_sec = config.instance_down_timeout_sec, instance_down_timeout_sec = config.instance_down_timeout_sec,
instance_get_uuid_frequency_sec = config.instance_get_uuid_frequency_sec,
sync_mode = coordinator_query->sync_mode_]() mutable { sync_mode = coordinator_query->sync_mode_]() mutable {
handler.RegisterReplicationInstance(std::string(coordinator_socket_address_tv.ValueString()), handler.RegisterReplicationInstance(std::string(coordinator_socket_address_tv.ValueString()),
std::string(replication_socket_address_tv.ValueString()), std::string(replication_socket_address_tv.ValueString()),
instance_health_check_frequency_sec, instance_down_timeout_sec, instance_health_check_frequency_sec, instance_down_timeout_sec,
instance_name, sync_mode); instance_get_uuid_frequency_sec, instance_name, sync_mode);
return std::vector<std::vector<TypedValue>>(); return std::vector<std::vector<TypedValue>>();
}; };

View File

@ -109,6 +109,7 @@ class CoordinatorQueryHandler {
std::string const &replication_socket_address, std::string const &replication_socket_address,
std::chrono::seconds const &instance_health_check_frequency, std::chrono::seconds const &instance_health_check_frequency,
std::chrono::seconds const &instance_down_timeout, std::chrono::seconds const &instance_down_timeout,
std::chrono::seconds const &instance_get_uuid_frequency,
std::string const &instance_name, CoordinatorQuery::SyncMode sync_mode) = 0; std::string const &instance_name, CoordinatorQuery::SyncMode sync_mode) = 0;
/// @throw QueryRuntimeException if an error ocurred. /// @throw QueryRuntimeException if an error ocurred.

View File

@ -49,6 +49,9 @@ struct ReplicationQueryHandler {
virtual bool SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config, virtual bool SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
const std::optional<utils::UUID> &main_uuid) = 0; const std::optional<utils::UUID> &main_uuid) = 0;
virtual bool TrySetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
const std::optional<utils::UUID> &main_uuid) = 0;
// as MAIN, define and connect to REPLICAs // as MAIN, define and connect to REPLICAs
virtual auto TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid) virtual auto TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid)
-> utils::BasicResult<RegisterReplicaError> = 0; -> utils::BasicResult<RegisterReplicaError> = 0;

View File

@ -62,8 +62,9 @@ ReplicationState::ReplicationState(std::optional<std::filesystem::path> durabili
} }
#endif #endif
if (std::holds_alternative<RoleReplicaData>(replication_data)) { if (std::holds_alternative<RoleReplicaData>(replication_data)) {
spdlog::trace("Recovered main's uuid for replica {}", auto &replica_uuid = std::get<RoleReplicaData>(replication_data).uuid_;
std::string(std::get<RoleReplicaData>(replication_data).uuid_.value())); std::string uuid = replica_uuid.has_value() ? std::string(replica_uuid.value()) : "";
spdlog::trace("Recovered main's uuid for replica {}", uuid);
} else { } else {
spdlog::trace("Recovered uuid for main {}", std::string(std::get<RoleMainData>(replication_data).uuid_)); spdlog::trace("Recovered uuid for main {}", std::string(std::get<RoleMainData>(replication_data).uuid_));
} }

View File

@ -14,6 +14,7 @@
#include "dbms/dbms_handler.hpp" #include "dbms/dbms_handler.hpp"
#include "flags/experimental.hpp" #include "flags/experimental.hpp"
#include "replication/include/replication/state.hpp" #include "replication/include/replication/state.hpp"
#include "replication_handler/system_replication.hpp"
#include "replication_handler/system_rpc.hpp" #include "replication_handler/system_rpc.hpp"
#include "utils/result.hpp" #include "utils/result.hpp"
@ -113,10 +114,14 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
// as REPLICA, become MAIN // as REPLICA, become MAIN
bool SetReplicationRoleMain() override; bool SetReplicationRoleMain() override;
// as MAIN, become REPLICA // as MAIN, become REPLICA, can be called on MAIN and REPLICA
bool SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config, bool SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
const std::optional<utils::UUID> &main_uuid) override; const std::optional<utils::UUID> &main_uuid) override;
// as MAIN, become REPLICA, can be called only on MAIN
bool TrySetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
const std::optional<utils::UUID> &main_uuid) override;
// as MAIN, define and connect to REPLICAs // as MAIN, define and connect to REPLICAs
auto TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid) auto TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid)
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> override; -> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> override;
@ -137,12 +142,13 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
auto GetReplState() const -> const memgraph::replication::ReplicationState &; auto GetReplState() const -> const memgraph::replication::ReplicationState &;
auto GetReplState() -> memgraph::replication::ReplicationState &; auto GetReplState() -> memgraph::replication::ReplicationState &;
auto GetReplicaUUID() -> std::optional<utils::UUID>;
private: private:
template <bool AllowReplicaToDivergeFromMain> template <bool AllowRPCFailure>
auto RegisterReplica_(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid) auto RegisterReplica_(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid)
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> { -> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> {
MG_ASSERT(repl_state_.IsMain(), "Only main instance can register a replica!"); MG_ASSERT(repl_state_.IsMain(), "Only main instance can register a replica!");
auto maybe_client = repl_state_.RegisterReplica(config); auto maybe_client = repl_state_.RegisterReplica(config);
if (maybe_client.HasError()) { if (maybe_client.HasError()) {
switch (maybe_client.GetError()) { switch (maybe_client.GetError()) {
@ -159,7 +165,6 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
break; break;
} }
} }
using enum memgraph::flags::Experiments; using enum memgraph::flags::Experiments;
bool system_replication_enabled = flags::AreExperimentsEnabled(SYSTEM_REPLICATION); bool system_replication_enabled = flags::AreExperimentsEnabled(SYSTEM_REPLICATION);
if (!system_replication_enabled && dbms_handler_.Count() > 1) { if (!system_replication_enabled && dbms_handler_.Count() > 1) {
@ -167,25 +172,21 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
} }
const auto main_uuid = const auto main_uuid =
std::get<memgraph::replication::RoleMainData>(dbms_handler_.ReplicationState().ReplicationData()).uuid_; std::get<memgraph::replication::RoleMainData>(dbms_handler_.ReplicationState().ReplicationData()).uuid_;
if (send_swap_uuid) { if (send_swap_uuid) {
if (!memgraph::replication_coordination_glue::SendSwapMainUUIDRpc(maybe_client.GetValue()->rpc_client_, if (!memgraph::replication_coordination_glue::SendSwapMainUUIDRpc(maybe_client.GetValue()->rpc_client_,
main_uuid)) { main_uuid)) {
return memgraph::query::RegisterReplicaError::ERROR_ACCEPTING_MAIN; return memgraph::query::RegisterReplicaError::ERROR_ACCEPTING_MAIN;
} }
} }
#ifdef MG_ENTERPRISE #ifdef MG_ENTERPRISE
// Update system before enabling individual storage <-> replica clients // Update system before enabling individual storage <-> replica clients
SystemRestore(*maybe_client.GetValue(), system_, dbms_handler_, main_uuid, auth_); SystemRestore(*maybe_client.GetValue(), system_, dbms_handler_, main_uuid, auth_);
#endif #endif
const auto dbms_error = HandleRegisterReplicaStatus(maybe_client); const auto dbms_error = HandleRegisterReplicaStatus(maybe_client);
if (dbms_error.has_value()) { if (dbms_error.has_value()) {
return *dbms_error; return *dbms_error;
} }
auto &instance_client_ptr = maybe_client.GetValue(); auto &instance_client_ptr = maybe_client.GetValue();
bool all_clients_good = true; bool all_clients_good = true;
// Add database specific clients (NOTE Currently all databases are connected to each replica) // Add database specific clients (NOTE Currently all databases are connected to each replica)
dbms_handler_.ForEach([&](dbms::DatabaseAccess db_acc) { dbms_handler_.ForEach([&](dbms::DatabaseAccess db_acc) {
@ -195,7 +196,6 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
} }
// TODO: ATM only IN_MEMORY_TRANSACTIONAL, fix other modes // TODO: ATM only IN_MEMORY_TRANSACTIONAL, fix other modes
if (storage->storage_mode_ != storage::StorageMode::IN_MEMORY_TRANSACTIONAL) return; if (storage->storage_mode_ != storage::StorageMode::IN_MEMORY_TRANSACTIONAL) return;
all_clients_good &= storage->repl_storage_state_.replication_clients_.WithLock( all_clients_good &= storage->repl_storage_state_.replication_clients_.WithLock(
[storage, &instance_client_ptr, db_acc = std::move(db_acc), [storage, &instance_client_ptr, db_acc = std::move(db_acc),
main_uuid](auto &storage_clients) mutable { // NOLINT main_uuid](auto &storage_clients) mutable { // NOLINT
@ -203,9 +203,12 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
client->Start(storage, std::move(db_acc)); client->Start(storage, std::move(db_acc));
bool const success = std::invoke([state = client->State()]() { bool const success = std::invoke([state = client->State()]() {
if (state == storage::replication::ReplicaState::DIVERGED_FROM_MAIN) { if (state == storage::replication::ReplicaState::DIVERGED_FROM_MAIN) {
return AllowReplicaToDivergeFromMain; return false;
} }
return state != storage::replication::ReplicaState::MAYBE_BEHIND; if (state == storage::replication::ReplicaState::MAYBE_BEHIND) {
return AllowRPCFailure;
}
return true;
}); });
if (success) { if (success) {
@ -214,14 +217,12 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
return success; return success;
}); });
}); });
// NOTE Currently if any databases fails, we revert back // NOTE Currently if any databases fails, we revert back
if (!all_clients_good) { if (!all_clients_good) {
spdlog::error("Failed to register all databases on the REPLICA \"{}\"", config.name); spdlog::error("Failed to register all databases on the REPLICA \"{}\"", config.name);
UnregisterReplica(config.name); UnregisterReplica(config.name);
return memgraph::query::RegisterReplicaError::CONNECTION_FAILED; return memgraph::query::RegisterReplicaError::CONNECTION_FAILED;
} }
// No client error, start instance level client // No client error, start instance level client
#ifdef MG_ENTERPRISE #ifdef MG_ENTERPRISE
StartReplicaClient(*instance_client_ptr, system_, dbms_handler_, main_uuid, auth_); StartReplicaClient(*instance_client_ptr, system_, dbms_handler_, main_uuid, auth_);
@ -231,6 +232,57 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
return {}; return {};
} }
template <bool AllowIdempotency>
bool SetReplicationRoleReplica_(const memgraph::replication::ReplicationServerConfig &config,
const std::optional<utils::UUID> &main_uuid) {
if (repl_state_.IsReplica()) {
if (!AllowIdempotency) {
return false;
}
// We don't want to restart the server if we're already a REPLICA with correct config
auto &replica_data = std::get<memgraph::replication::RoleReplicaData>(repl_state_.ReplicationData());
if (replica_data.config == config) {
return true;
}
repl_state_.SetReplicationRoleReplica(config, main_uuid);
#ifdef MG_ENTERPRISE
return StartRpcServer(dbms_handler_, replica_data, auth_, system_);
#else
return StartRpcServer(dbms_handler_, replica_data);
#endif
}
// TODO StorageState needs to be synched. Could have a dangling reference if someone adds a database as we are
// deleting the replica.
// Remove database specific clients
dbms_handler_.ForEach([&](memgraph::dbms::DatabaseAccess db_acc) {
auto *storage = db_acc->storage();
storage->repl_storage_state_.replication_clients_.WithLock([](auto &clients) { clients.clear(); });
});
// Remove instance level clients
std::get<memgraph::replication::RoleMainData>(repl_state_.ReplicationData()).registered_replicas_.clear();
// Creates the server
repl_state_.SetReplicationRoleReplica(config, main_uuid);
// Start
const auto success =
std::visit(memgraph::utils::Overloaded{[](memgraph::replication::RoleMainData &) {
// ASSERT
return false;
},
[this](memgraph::replication::RoleReplicaData &data) {
#ifdef MG_ENTERPRISE
return StartRpcServer(dbms_handler_, data, auth_, system_);
#else
return StartRpcServer(dbms_handler_, data);
#endif
}},
repl_state_.ReplicationData());
// TODO Handle error (restore to main?)
return success;
}
memgraph::replication::ReplicationState &repl_state_; memgraph::replication::ReplicationState &repl_state_;
memgraph::dbms::DbmsHandler &dbms_handler_; memgraph::dbms::DbmsHandler &dbms_handler_;

View File

@ -192,41 +192,12 @@ bool ReplicationHandler::SetReplicationRoleMain() {
bool ReplicationHandler::SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config, bool ReplicationHandler::SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
const std::optional<utils::UUID> &main_uuid) { const std::optional<utils::UUID> &main_uuid) {
// We don't want to restart the server if we're already a REPLICA return SetReplicationRoleReplica_<false>(config, main_uuid);
if (repl_state_.IsReplica()) {
spdlog::trace("Instance has already has replica role.");
return false;
} }
// TODO StorageState needs to be synched. Could have a dangling reference if someone adds a database as we are bool ReplicationHandler::TrySetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
// deleting the replica. const std::optional<utils::UUID> &main_uuid) {
// Remove database specific clients return SetReplicationRoleReplica_<true>(config, main_uuid);
dbms_handler_.ForEach([&](memgraph::dbms::DatabaseAccess db_acc) {
auto *storage = db_acc->storage();
storage->repl_storage_state_.replication_clients_.WithLock([](auto &clients) { clients.clear(); });
});
// Remove instance level clients
std::get<memgraph::replication::RoleMainData>(repl_state_.ReplicationData()).registered_replicas_.clear();
// Creates the server
repl_state_.SetReplicationRoleReplica(config, main_uuid);
// Start
const auto success =
std::visit(memgraph::utils::Overloaded{[](memgraph::replication::RoleMainData &) {
// ASSERT
return false;
},
[this](memgraph::replication::RoleReplicaData &data) {
#ifdef MG_ENTERPRISE
return StartRpcServer(dbms_handler_, data, auth_, system_);
#else
return StartRpcServer(dbms_handler_, data);
#endif
}},
repl_state_.ReplicationData());
// TODO Handle error (restore to main?)
return success;
} }
bool ReplicationHandler::DoReplicaToMainPromotion(const utils::UUID &main_uuid) { bool ReplicationHandler::DoReplicaToMainPromotion(const utils::UUID &main_uuid) {
@ -258,13 +229,13 @@ bool ReplicationHandler::DoReplicaToMainPromotion(const utils::UUID &main_uuid)
auto ReplicationHandler::TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config, auto ReplicationHandler::TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config,
bool send_swap_uuid) bool send_swap_uuid)
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> { -> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> {
return RegisterReplica_<false>(config, send_swap_uuid); return RegisterReplica_<true>(config, send_swap_uuid);
} }
auto ReplicationHandler::RegisterReplica(const memgraph::replication::ReplicationClientConfig &config, auto ReplicationHandler::RegisterReplica(const memgraph::replication::ReplicationClientConfig &config,
bool send_swap_uuid) bool send_swap_uuid)
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> { -> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> {
return RegisterReplica_<true>(config, send_swap_uuid); return RegisterReplica_<false>(config, send_swap_uuid);
} }
auto ReplicationHandler::UnregisterReplica(std::string_view name) -> memgraph::query::UnregisterReplicaResult { auto ReplicationHandler::UnregisterReplica(std::string_view name) -> memgraph::query::UnregisterReplicaResult {
@ -297,6 +268,11 @@ auto ReplicationHandler::GetRole() const -> memgraph::replication_coordination_g
return repl_state_.GetRole(); return repl_state_.GetRole();
} }
auto ReplicationHandler::GetReplicaUUID() -> std::optional<utils::UUID> {
MG_ASSERT(repl_state_.IsReplica());
return std::get<RoleReplicaData>(repl_state_.ReplicationData()).uuid_;
}
auto ReplicationHandler::GetReplState() const -> const memgraph::replication::ReplicationState & { return repl_state_; } auto ReplicationHandler::GetReplState() const -> const memgraph::replication::ReplicationState & { return repl_state_; }
auto ReplicationHandler::GetReplState() -> memgraph::replication::ReplicationState & { return repl_state_; } auto ReplicationHandler::GetReplState() -> memgraph::replication::ReplicationState & { return repl_state_; }

View File

@ -112,6 +112,9 @@ enum class TypeId : uint64_t {
COORD_ENABLE_WRITING_ON_MAIN_REQ, COORD_ENABLE_WRITING_ON_MAIN_REQ,
COORD_ENABLE_WRITING_ON_MAIN_RES, COORD_ENABLE_WRITING_ON_MAIN_RES,
COORD_GET_UUID_REQ,
COORD_GET_UUID_RES,
// AST // AST
AST_LABELIX = 3000, AST_LABELIX = 3000,
AST_PROPERTYIX, AST_PROPERTYIX,

View File

@ -71,6 +71,7 @@ startup_config_dict = {
"raft_server_id": ("0", "0", "Unique ID of the raft server."), "raft_server_id": ("0", "0", "Unique ID of the raft server."),
"instance_down_timeout_sec": ("5", "5", "Time duration after which an instance is considered down."), "instance_down_timeout_sec": ("5", "5", "Time duration after which an instance is considered down."),
"instance_health_check_frequency_sec": ("1", "1", "The time duration between two health checks/pings."), "instance_health_check_frequency_sec": ("1", "1", "The time duration between two health checks/pings."),
"instance_get_uuid_frequency_sec": ("10", "10", "The time duration between two instance uuid checks."),
"data_directory": ("mg_data", "mg_data", "Path to directory in which to save all permanent data."), "data_directory": ("mg_data", "mg_data", "Path to directory in which to save all permanent data."),
"data_recovery_on_startup": ( "data_recovery_on_startup": (
"false", "false",

View File

@ -10,11 +10,13 @@
# licenses/APL.txt. # licenses/APL.txt.
import os import os
import shutil
import sys import sys
import tempfile
import interactive_mg_runner import interactive_mg_runner
import pytest import pytest
from common import execute_and_fetch_all from common import execute_and_fetch_all, safe_execute
from mg_utils import mg_sleep_and_assert from mg_utils import mg_sleep_and_assert
interactive_mg_runner.SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) interactive_mg_runner.SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
@ -38,7 +40,7 @@ MEMGRAPH_FIRST_CLUSTER_DESCRIPTION = {
} }
MEMGRAPH_INSTANCES_DESCRIPTION = { MEMGRAPH_SECOND_CLUSTER_DESCRIPTION = {
"replica": { "replica": {
"args": ["--bolt-port", "7689", "--log-level", "TRACE"], "args": ["--bolt-port", "7689", "--log-level", "TRACE"],
"log_file": "replica.log", "log_file": "replica.log",
@ -71,7 +73,7 @@ def test_replication_works_on_failover(connection):
assert actual_data_on_main == expected_data_on_main assert actual_data_on_main == expected_data_on_main
# 3 # 3
interactive_mg_runner.start_all_keep_others(MEMGRAPH_INSTANCES_DESCRIPTION) interactive_mg_runner.start_all_keep_others(MEMGRAPH_SECOND_CLUSTER_DESCRIPTION)
# 4 # 4
new_main_cursor = connection(7690, "main_2").cursor() new_main_cursor = connection(7690, "main_2").cursor()
@ -113,5 +115,144 @@ def test_replication_works_on_failover(connection):
interactive_mg_runner.stop_all() interactive_mg_runner.stop_all()
def test_not_replicate_old_main_register_new_cluster(connection):
# Goal of this test is to check that although replica is registered in one cluster
# it can be re-registered to new cluster
# This flow checks if Registering replica is idempotent and that old main cannot talk to replica
# 1. We start all replicas and main in one cluster
# 2. Main from first cluster can see all replicas
# 3. We start all replicas and main in second cluster, by reusing one replica from first cluster
# 4. New main should see replica. Registration should pass (idempotent registration)
# 5. Old main should not talk to new replica
# 6. New main should talk to replica
TEMP_DIR = tempfile.TemporaryDirectory().name
MEMGRAPH_FISRT_COORD_CLUSTER_DESCRIPTION = {
"shared_instance": {
"args": [
"--bolt-port",
"7688",
"--log-level",
"TRACE",
"--coordinator-server-port",
"10011",
],
"log_file": "instance_1.log",
"data_directory": f"{TEMP_DIR}/shared_instance",
"setup_queries": [],
},
"instance_2": {
"args": [
"--bolt-port",
"7689",
"--log-level",
"TRACE",
"--coordinator-server-port",
"10012",
],
"log_file": "instance_2.log",
"data_directory": f"{TEMP_DIR}/instance_2",
"setup_queries": [],
},
"coordinator_1": {
"args": ["--bolt-port", "7690", "--log-level=TRACE", "--raft-server-id=1", "--raft-server-port=10111"],
"log_file": "coordinator.log",
"setup_queries": [
"REGISTER INSTANCE shared_instance ON '127.0.0.1:10011' WITH '127.0.0.1:10001';",
"REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002';",
"SET INSTANCE instance_2 TO MAIN",
],
},
}
# 1
interactive_mg_runner.start_all_keep_others(MEMGRAPH_FISRT_COORD_CLUSTER_DESCRIPTION)
# 2
first_cluster_coord_cursor = connection(7690, "coord_1").cursor()
def show_repl_cluster():
return sorted(list(execute_and_fetch_all(first_cluster_coord_cursor, "SHOW INSTANCES;")))
expected_data_up_first_cluster = [
("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
("instance_2", "", "127.0.0.1:10012", True, "main"),
("shared_instance", "", "127.0.0.1:10011", True, "replica"),
]
mg_sleep_and_assert(expected_data_up_first_cluster, show_repl_cluster)
# 3
MEMGRAPH_SECOND_COORD_CLUSTER_DESCRIPTION = {
"instance_3": {
"args": [
"--bolt-port",
"7687",
"--log-level",
"TRACE",
"--coordinator-server-port",
"10013",
],
"log_file": "instance_3.log",
"data_directory": f"{TEMP_DIR}/instance_3",
"setup_queries": [],
},
"coordinator_2": {
"args": ["--bolt-port", "7691", "--log-level=TRACE", "--raft-server-id=1", "--raft-server-port=10112"],
"log_file": "coordinator.log",
"setup_queries": [],
},
}
interactive_mg_runner.start_all_keep_others(MEMGRAPH_SECOND_COORD_CLUSTER_DESCRIPTION)
second_cluster_coord_cursor = connection(7691, "coord_2").cursor()
execute_and_fetch_all(
second_cluster_coord_cursor, "REGISTER INSTANCE shared_instance ON '127.0.0.1:10011' WITH '127.0.0.1:10001';"
)
execute_and_fetch_all(
second_cluster_coord_cursor, "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003';"
)
execute_and_fetch_all(second_cluster_coord_cursor, "SET INSTANCE instance_3 TO MAIN")
# 4
def show_repl_cluster():
return sorted(list(execute_and_fetch_all(second_cluster_coord_cursor, "SHOW INSTANCES;")))
expected_data_up_second_cluster = [
("coordinator_1", "127.0.0.1:10112", "", True, "coordinator"),
("instance_3", "", "127.0.0.1:10013", True, "main"),
("shared_instance", "", "127.0.0.1:10011", True, "replica"),
]
mg_sleep_and_assert(expected_data_up_second_cluster, show_repl_cluster)
# 5
main_1_cursor = connection(7689, "main_1").cursor()
with pytest.raises(Exception) as e:
execute_and_fetch_all(main_1_cursor, "CREATE ();")
assert (
str(e.value)
== "Replication Exception: At least one SYNC replica has not confirmed committing last transaction. Check the status of the replicas using 'SHOW REPLICAS' query."
)
shared_replica_cursor = connection(7688, "shared_replica").cursor()
res = execute_and_fetch_all(shared_replica_cursor, "MATCH (n) RETURN count(n);")[0][0]
assert res == 0, "Old main should not replicate to 'shared' replica"
# 6
main_2_cursor = connection(7687, "main_2").cursor()
execute_and_fetch_all(main_2_cursor, "CREATE ();")
shared_replica_cursor = connection(7688, "shared_replica").cursor()
res = execute_and_fetch_all(shared_replica_cursor, "MATCH (n) RETURN count(n);")[0][0]
assert res == 1, "New main should replicate to 'shared' replica"
interactive_mg_runner.stop_all()
if __name__ == "__main__": if __name__ == "__main__":
sys.exit(pytest.main([__file__, "-rA"])) sys.exit(pytest.main([__file__, "-rA"]))

View File

@ -147,6 +147,105 @@ def test_replication_works_on_failover():
interactive_mg_runner.stop_all(MEMGRAPH_INSTANCES_DESCRIPTION) interactive_mg_runner.stop_all(MEMGRAPH_INSTANCES_DESCRIPTION)
def test_replication_works_on_replica_instance_restart():
# Goal of this test is to check the replication works after replica goes down and restarts
# 1. We start all replicas, main and coordinator manually: we want to be able to kill them ourselves without relying on external tooling to kill processes.
# 2. We check that main has correct state
# 3. We kill replica
# 4. We check that main cannot replicate to replica
# 5. We bring replica back up
# 6. We check that replica gets data
safe_execute(shutil.rmtree, TEMP_DIR)
# 1
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
# 2
main_cursor = connect(host="localhost", port=7687).cursor()
expected_data_on_main = [
("instance_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
]
actual_data_on_main = sorted(list(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;")))
assert actual_data_on_main == expected_data_on_main
# 3
coord_cursor = connect(host="localhost", port=7690).cursor()
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_2")
def retrieve_data_show_repl_cluster():
return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW INSTANCES;")))
expected_data_on_coord = [
("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
("instance_1", "", "127.0.0.1:10011", True, "replica"),
("instance_2", "", "127.0.0.1:10012", False, "unknown"),
("instance_3", "", "127.0.0.1:10013", True, "main"),
]
mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
def retrieve_data_show_replicas():
return sorted(list(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;")))
expected_data_on_main = [
("instance_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
("instance_2", "127.0.0.1:10002", "sync", 0, 0, "invalid"),
]
mg_sleep_and_assert(expected_data_on_main, retrieve_data_show_replicas)
# 4
instance_1_cursor = connect(host="localhost", port=7688).cursor()
with pytest.raises(Exception) as e:
execute_and_fetch_all(main_cursor, "CREATE ();")
assert (
str(e.value)
== "Replication Exception: At least one SYNC replica has not confirmed committing last transaction. Check the status of the replicas using 'SHOW REPLICAS' query."
)
res_instance_1 = execute_and_fetch_all(instance_1_cursor, "MATCH (n) RETURN count(n)")[0][0]
assert res_instance_1 == 1
def retrieve_data_show_replicas():
return sorted(list(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;")))
expected_data_on_main = [
("instance_1", "127.0.0.1:10001", "sync", 2, 0, "ready"),
("instance_2", "127.0.0.1:10002", "sync", 0, 0, "invalid"),
]
mg_sleep_and_assert(expected_data_on_main, retrieve_data_show_replicas)
# 5.
interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_2")
def retrieve_data_show_repl_cluster():
return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW INSTANCES;")))
expected_data_on_coord = [
("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
("instance_1", "", "127.0.0.1:10011", True, "replica"),
("instance_2", "", "127.0.0.1:10012", True, "replica"),
("instance_3", "", "127.0.0.1:10013", True, "main"),
]
mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
def retrieve_data_show_replicas():
return sorted(list(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;")))
expected_data_on_main = [
("instance_1", "127.0.0.1:10001", "sync", 2, 0, "ready"),
("instance_2", "127.0.0.1:10002", "sync", 2, 0, "ready"),
]
mg_sleep_and_assert(expected_data_on_main, retrieve_data_show_replicas)
# 6.
instance_2_cursor = connect(port=7689, host="localhost").cursor()
execute_and_fetch_all(main_cursor, "CREATE ();")
res_instance_2 = execute_and_fetch_all(instance_2_cursor, "MATCH (n) RETURN count(n)")[0][0]
assert res_instance_2 == 2
def test_show_instances(): def test_show_instances():
safe_execute(shutil.rmtree, TEMP_DIR) safe_execute(shutil.rmtree, TEMP_DIR)
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION) interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

View File

@ -142,7 +142,7 @@ TEST_F(ReplicationTest, BasicSynchronousReplicationTest) {
MinMemgraph replica(repl_conf); MinMemgraph replica(repl_conf);
auto replica_store_handler = replica.repl_handler; auto replica_store_handler = replica.repl_handler;
replica_store_handler.SetReplicationRoleReplica( replica_store_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = ports[0], .port = ports[0],
@ -439,13 +439,13 @@ TEST_F(ReplicationTest, MultipleSynchronousReplicationTest) {
MinMemgraph replica1(repl_conf); MinMemgraph replica1(repl_conf);
MinMemgraph replica2(repl2_conf); MinMemgraph replica2(repl2_conf);
replica1.repl_handler.SetReplicationRoleReplica( replica1.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = ports[0], .port = ports[0],
}, },
std::nullopt); std::nullopt);
replica2.repl_handler.SetReplicationRoleReplica( replica2.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = ports[1], .port = ports[1],
@ -597,7 +597,7 @@ TEST_F(ReplicationTest, RecoveryProcess) {
MinMemgraph replica(repl_conf); MinMemgraph replica(repl_conf);
auto replica_store_handler = replica.repl_handler; auto replica_store_handler = replica.repl_handler;
replica_store_handler.SetReplicationRoleReplica( replica_store_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = ports[0], .port = ports[0],
@ -676,7 +676,7 @@ TEST_F(ReplicationTest, BasicAsynchronousReplicationTest) {
MinMemgraph replica_async(repl_conf); MinMemgraph replica_async(repl_conf);
auto replica_store_handler = replica_async.repl_handler; auto replica_store_handler = replica_async.repl_handler;
replica_store_handler.SetReplicationRoleReplica( replica_store_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = ports[1], .port = ports[1],
@ -726,7 +726,7 @@ TEST_F(ReplicationTest, EpochTest) {
MinMemgraph main(main_conf); MinMemgraph main(main_conf);
MinMemgraph replica1(repl_conf); MinMemgraph replica1(repl_conf);
replica1.repl_handler.SetReplicationRoleReplica( replica1.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = ports[0], .port = ports[0],
@ -734,7 +734,7 @@ TEST_F(ReplicationTest, EpochTest) {
std::nullopt); std::nullopt);
MinMemgraph replica2(repl2_conf); MinMemgraph replica2(repl2_conf);
replica2.repl_handler.SetReplicationRoleReplica( replica2.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = 10001, .port = 10001,
@ -819,7 +819,7 @@ TEST_F(ReplicationTest, EpochTest) {
ASSERT_FALSE(acc->Commit().HasError()); ASSERT_FALSE(acc->Commit().HasError());
} }
replica1.repl_handler.SetReplicationRoleReplica( replica1.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = ports[0], .port = ports[0],
@ -858,7 +858,7 @@ TEST_F(ReplicationTest, ReplicationInformation) {
MinMemgraph replica1(repl_conf); MinMemgraph replica1(repl_conf);
uint16_t replica1_port = 10001; uint16_t replica1_port = 10001;
replica1.repl_handler.SetReplicationRoleReplica( replica1.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = replica1_port, .port = replica1_port,
@ -867,7 +867,7 @@ TEST_F(ReplicationTest, ReplicationInformation) {
uint16_t replica2_port = 10002; uint16_t replica2_port = 10002;
MinMemgraph replica2(repl2_conf); MinMemgraph replica2(repl2_conf);
replica2.repl_handler.SetReplicationRoleReplica( replica2.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = replica2_port, .port = replica2_port,
@ -923,7 +923,7 @@ TEST_F(ReplicationTest, ReplicationReplicaWithExistingName) {
MinMemgraph replica1(repl_conf); MinMemgraph replica1(repl_conf);
uint16_t replica1_port = 10001; uint16_t replica1_port = 10001;
replica1.repl_handler.SetReplicationRoleReplica( replica1.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = replica1_port, .port = replica1_port,
@ -932,7 +932,7 @@ TEST_F(ReplicationTest, ReplicationReplicaWithExistingName) {
uint16_t replica2_port = 10002; uint16_t replica2_port = 10002;
MinMemgraph replica2(repl2_conf); MinMemgraph replica2(repl2_conf);
replica2.repl_handler.SetReplicationRoleReplica( replica2.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = replica2_port, .port = replica2_port,
@ -966,7 +966,7 @@ TEST_F(ReplicationTest, ReplicationReplicaWithExistingEndPoint) {
MinMemgraph main(main_conf); MinMemgraph main(main_conf);
MinMemgraph replica1(repl_conf); MinMemgraph replica1(repl_conf);
replica1.repl_handler.SetReplicationRoleReplica( replica1.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = common_port, .port = common_port,
@ -974,7 +974,7 @@ TEST_F(ReplicationTest, ReplicationReplicaWithExistingEndPoint) {
std::nullopt); std::nullopt);
MinMemgraph replica2(repl2_conf); MinMemgraph replica2(repl2_conf);
replica2.repl_handler.SetReplicationRoleReplica( replica2.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = common_port, .port = common_port,
@ -1023,7 +1023,7 @@ TEST_F(ReplicationTest, RestoringReplicationAtStartupAfterDroppingReplica) {
std::optional<MinMemgraph> main(main_config); std::optional<MinMemgraph> main(main_config);
MinMemgraph replica1(replica1_config); MinMemgraph replica1(replica1_config);
replica1.repl_handler.SetReplicationRoleReplica( replica1.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = ports[0], .port = ports[0],
@ -1031,7 +1031,7 @@ TEST_F(ReplicationTest, RestoringReplicationAtStartupAfterDroppingReplica) {
std::nullopt); std::nullopt);
MinMemgraph replica2(replica2_config); MinMemgraph replica2(replica2_config);
replica2.repl_handler.SetReplicationRoleReplica( replica2.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = ports[1], .port = ports[1],
@ -1088,7 +1088,7 @@ TEST_F(ReplicationTest, RestoringReplicationAtStartup) {
std::optional<MinMemgraph> main(main_config); std::optional<MinMemgraph> main(main_config);
MinMemgraph replica1(repl_conf); MinMemgraph replica1(repl_conf);
replica1.repl_handler.SetReplicationRoleReplica( replica1.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = ports[0], .port = ports[0],
@ -1097,7 +1097,7 @@ TEST_F(ReplicationTest, RestoringReplicationAtStartup) {
MinMemgraph replica2(repl2_conf); MinMemgraph replica2(repl2_conf);
replica2.repl_handler.SetReplicationRoleReplica( replica2.repl_handler.TrySetReplicationRoleReplica(
ReplicationServerConfig{ ReplicationServerConfig{
.ip_address = local_host, .ip_address = local_host,
.port = ports[1], .port = ports[1],