HA: Polish flow for replicas from coordinator (#1711)
This commit is contained in:
parent
5f2e3f01d0
commit
bfc756c092
@ -15,6 +15,7 @@ target_sources(mg-coordination
|
|||||||
include/coordination/instance_status.hpp
|
include/coordination/instance_status.hpp
|
||||||
include/coordination/replication_instance.hpp
|
include/coordination/replication_instance.hpp
|
||||||
include/coordination/raft_state.hpp
|
include/coordination/raft_state.hpp
|
||||||
|
include/coordination/rpc_errors.hpp
|
||||||
|
|
||||||
include/nuraft/coordinator_log_store.hpp
|
include/nuraft/coordinator_log_store.hpp
|
||||||
include/nuraft/coordinator_state_machine.hpp
|
include/nuraft/coordinator_state_machine.hpp
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
#include "coordination/coordinator_config.hpp"
|
#include "coordination/coordinator_config.hpp"
|
||||||
#include "coordination/coordinator_rpc.hpp"
|
#include "coordination/coordinator_rpc.hpp"
|
||||||
#include "replication_coordination_glue/messages.hpp"
|
#include "replication_coordination_glue/messages.hpp"
|
||||||
|
#include "utils/result.hpp"
|
||||||
|
|
||||||
namespace memgraph::coordination {
|
namespace memgraph::coordination {
|
||||||
|
|
||||||
@ -45,6 +46,10 @@ auto CoordinatorClient::InstanceDownTimeoutSec() const -> std::chrono::seconds {
|
|||||||
return config_.instance_down_timeout_sec;
|
return config_.instance_down_timeout_sec;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto CoordinatorClient::InstanceGetUUIDFrequencySec() const -> std::chrono::seconds {
|
||||||
|
return config_.instance_get_uuid_frequency_sec;
|
||||||
|
}
|
||||||
|
|
||||||
void CoordinatorClient::StartFrequentCheck() {
|
void CoordinatorClient::StartFrequentCheck() {
|
||||||
if (instance_checker_.IsRunning()) {
|
if (instance_checker_.IsRunning()) {
|
||||||
return;
|
return;
|
||||||
@ -140,6 +145,18 @@ auto CoordinatorClient::SendUnregisterReplicaRpc(std::string const &instance_nam
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto CoordinatorClient::SendGetInstanceUUIDRpc() const
|
||||||
|
-> utils::BasicResult<GetInstanceUUIDError, std::optional<utils::UUID>> {
|
||||||
|
try {
|
||||||
|
auto stream{rpc_client_.Stream<GetInstanceUUIDRpc>()};
|
||||||
|
auto res = stream.AwaitResponse();
|
||||||
|
return res.uuid;
|
||||||
|
} catch (const rpc::RpcFailedException &) {
|
||||||
|
spdlog::error("RPC error occured while sending GetInstance UUID RPC");
|
||||||
|
return GetInstanceUUIDError::RPC_EXCEPTION;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
auto CoordinatorClient::SendEnableWritingOnMainRpc() const -> bool {
|
auto CoordinatorClient::SendEnableWritingOnMainRpc() const -> bool {
|
||||||
try {
|
try {
|
||||||
auto stream{rpc_client_.Stream<EnableWritingOnMainRpc>()};
|
auto stream{rpc_client_.Stream<EnableWritingOnMainRpc>()};
|
||||||
|
@ -51,6 +51,12 @@ void CoordinatorHandlers::Register(memgraph::coordination::CoordinatorServer &se
|
|||||||
spdlog::info("Received EnableWritingOnMainRpc on coordinator server");
|
spdlog::info("Received EnableWritingOnMainRpc on coordinator server");
|
||||||
CoordinatorHandlers::EnableWritingOnMainHandler(replication_handler, req_reader, res_builder);
|
CoordinatorHandlers::EnableWritingOnMainHandler(replication_handler, req_reader, res_builder);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
server.Register<coordination::GetInstanceUUIDRpc>(
|
||||||
|
[&replication_handler](slk::Reader *req_reader, slk::Builder *res_builder) -> void {
|
||||||
|
spdlog::info("Received GetInstanceUUIDRpc on coordinator server");
|
||||||
|
CoordinatorHandlers::GetInstanceUUIDHandler(replication_handler, req_reader, res_builder);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
void CoordinatorHandlers::SwapMainUUIDHandler(replication::ReplicationHandler &replication_handler,
|
void CoordinatorHandlers::SwapMainUUIDHandler(replication::ReplicationHandler &replication_handler,
|
||||||
@ -74,12 +80,6 @@ void CoordinatorHandlers::DemoteMainToReplicaHandler(replication::ReplicationHan
|
|||||||
slk::Reader *req_reader, slk::Builder *res_builder) {
|
slk::Reader *req_reader, slk::Builder *res_builder) {
|
||||||
spdlog::info("Executing DemoteMainToReplicaHandler");
|
spdlog::info("Executing DemoteMainToReplicaHandler");
|
||||||
|
|
||||||
if (!replication_handler.IsMain()) {
|
|
||||||
spdlog::error("Setting to replica must be performed on main.");
|
|
||||||
slk::Save(coordination::DemoteMainToReplicaRes{false}, res_builder);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
coordination::DemoteMainToReplicaReq req;
|
coordination::DemoteMainToReplicaReq req;
|
||||||
slk::Load(&req, req_reader);
|
slk::Load(&req, req_reader);
|
||||||
|
|
||||||
@ -89,11 +89,18 @@ void CoordinatorHandlers::DemoteMainToReplicaHandler(replication::ReplicationHan
|
|||||||
|
|
||||||
if (!replication_handler.SetReplicationRoleReplica(clients_config, std::nullopt)) {
|
if (!replication_handler.SetReplicationRoleReplica(clients_config, std::nullopt)) {
|
||||||
spdlog::error("Demoting main to replica failed!");
|
spdlog::error("Demoting main to replica failed!");
|
||||||
slk::Save(coordination::PromoteReplicaToMainRes{false}, res_builder);
|
slk::Save(coordination::DemoteMainToReplicaRes{false}, res_builder);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
slk::Save(coordination::PromoteReplicaToMainRes{true}, res_builder);
|
slk::Save(coordination::DemoteMainToReplicaRes{true}, res_builder);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CoordinatorHandlers::GetInstanceUUIDHandler(replication::ReplicationHandler &replication_handler,
|
||||||
|
slk::Reader * /*req_reader*/, slk::Builder *res_builder) {
|
||||||
|
spdlog::info("Executing GetInstanceUUIDHandler");
|
||||||
|
|
||||||
|
slk::Save(coordination::GetInstanceUUIDRes{replication_handler.GetReplicaUUID()}, res_builder);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CoordinatorHandlers::PromoteReplicaToMainHandler(replication::ReplicationHandler &replication_handler,
|
void CoordinatorHandlers::PromoteReplicaToMainHandler(replication::ReplicationHandler &replication_handler,
|
||||||
|
@ -48,9 +48,12 @@ CoordinatorInstance::CoordinatorInstance()
|
|||||||
spdlog::trace("Instance {} performing replica successful callback", repl_instance_name);
|
spdlog::trace("Instance {} performing replica successful callback", repl_instance_name);
|
||||||
auto &repl_instance = find_repl_instance(self, repl_instance_name);
|
auto &repl_instance = find_repl_instance(self, repl_instance_name);
|
||||||
|
|
||||||
|
// We need to get replicas UUID from time to time to ensure replica is listening to correct main
|
||||||
|
// and that it didn't go down for less time than we could notice
|
||||||
|
// We need to get id of main replica is listening to
|
||||||
|
// and swap if necessary
|
||||||
if (!repl_instance.EnsureReplicaHasCorrectMainUUID(self->GetMainUUID())) {
|
if (!repl_instance.EnsureReplicaHasCorrectMainUUID(self->GetMainUUID())) {
|
||||||
spdlog::error(
|
spdlog::error("Failed to swap uuid for replica instance {} which is alive", repl_instance.InstanceName());
|
||||||
fmt::format("Failed to swap uuid for replica instance {} which is alive", repl_instance.InstanceName()));
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -62,14 +65,6 @@ CoordinatorInstance::CoordinatorInstance()
|
|||||||
spdlog::trace("Instance {} performing replica failure callback", repl_instance_name);
|
spdlog::trace("Instance {} performing replica failure callback", repl_instance_name);
|
||||||
auto &repl_instance = find_repl_instance(self, repl_instance_name);
|
auto &repl_instance = find_repl_instance(self, repl_instance_name);
|
||||||
repl_instance.OnFailPing();
|
repl_instance.OnFailPing();
|
||||||
// We need to restart main uuid from instance since it was "down" at least a second
|
|
||||||
// There is slight delay, if we choose to use isAlive, instance can be down and back up in less than
|
|
||||||
// our isAlive time difference, which would lead to instance setting UUID to nullopt and stopping accepting any
|
|
||||||
// incoming RPCs from valid main
|
|
||||||
// TODO(antoniofilipovic) this needs here more complex logic
|
|
||||||
// We need to get id of main replica is listening to on successful ping
|
|
||||||
// and swap it to correct uuid if it failed
|
|
||||||
repl_instance.ResetMainUUID();
|
|
||||||
};
|
};
|
||||||
|
|
||||||
main_succ_cb_ = [find_repl_instance](CoordinatorInstance *self, std::string_view repl_instance_name) -> void {
|
main_succ_cb_ = [find_repl_instance](CoordinatorInstance *self, std::string_view repl_instance_name) -> void {
|
||||||
|
@ -80,6 +80,23 @@ void EnableWritingOnMainReq::Save(EnableWritingOnMainReq const &self, memgraph::
|
|||||||
|
|
||||||
void EnableWritingOnMainReq::Load(EnableWritingOnMainReq *self, memgraph::slk::Reader *reader) {}
|
void EnableWritingOnMainReq::Load(EnableWritingOnMainReq *self, memgraph::slk::Reader *reader) {}
|
||||||
|
|
||||||
|
// GetInstanceUUID
|
||||||
|
void GetInstanceUUIDReq::Save(const GetInstanceUUIDReq &self, memgraph::slk::Builder *builder) {
|
||||||
|
memgraph::slk::Save(self, builder);
|
||||||
|
}
|
||||||
|
|
||||||
|
void GetInstanceUUIDReq::Load(GetInstanceUUIDReq *self, memgraph::slk::Reader *reader) {
|
||||||
|
memgraph::slk::Load(self, reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
void GetInstanceUUIDRes::Save(const GetInstanceUUIDRes &self, memgraph::slk::Builder *builder) {
|
||||||
|
memgraph::slk::Save(self, builder);
|
||||||
|
}
|
||||||
|
|
||||||
|
void GetInstanceUUIDRes::Load(GetInstanceUUIDRes *self, memgraph::slk::Reader *reader) {
|
||||||
|
memgraph::slk::Load(self, reader);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace coordination
|
} // namespace coordination
|
||||||
|
|
||||||
constexpr utils::TypeInfo coordination::PromoteReplicaToMainReq::kType{utils::TypeId::COORD_FAILOVER_REQ,
|
constexpr utils::TypeInfo coordination::PromoteReplicaToMainReq::kType{utils::TypeId::COORD_FAILOVER_REQ,
|
||||||
@ -107,8 +124,16 @@ constexpr utils::TypeInfo coordination::EnableWritingOnMainReq::kType{utils::Typ
|
|||||||
constexpr utils::TypeInfo coordination::EnableWritingOnMainRes::kType{utils::TypeId::COORD_ENABLE_WRITING_ON_MAIN_RES,
|
constexpr utils::TypeInfo coordination::EnableWritingOnMainRes::kType{utils::TypeId::COORD_ENABLE_WRITING_ON_MAIN_RES,
|
||||||
"CoordEnableWritingOnMainRes", nullptr};
|
"CoordEnableWritingOnMainRes", nullptr};
|
||||||
|
|
||||||
|
constexpr utils::TypeInfo coordination::GetInstanceUUIDReq::kType{utils::TypeId::COORD_GET_UUID_REQ, "CoordGetUUIDReq",
|
||||||
|
nullptr};
|
||||||
|
|
||||||
|
constexpr utils::TypeInfo coordination::GetInstanceUUIDRes::kType{utils::TypeId::COORD_GET_UUID_RES, "CoordGetUUIDRes",
|
||||||
|
nullptr};
|
||||||
|
|
||||||
namespace slk {
|
namespace slk {
|
||||||
|
|
||||||
|
// PromoteReplicaToMainRpc
|
||||||
|
|
||||||
void Save(const memgraph::coordination::PromoteReplicaToMainRes &self, memgraph::slk::Builder *builder) {
|
void Save(const memgraph::coordination::PromoteReplicaToMainRes &self, memgraph::slk::Builder *builder) {
|
||||||
memgraph::slk::Save(self.success, builder);
|
memgraph::slk::Save(self.success, builder);
|
||||||
}
|
}
|
||||||
@ -127,6 +152,7 @@ void Load(memgraph::coordination::PromoteReplicaToMainReq *self, memgraph::slk::
|
|||||||
memgraph::slk::Load(&self->replication_clients_info, reader);
|
memgraph::slk::Load(&self->replication_clients_info, reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DemoteMainToReplicaRpc
|
||||||
void Save(const memgraph::coordination::DemoteMainToReplicaReq &self, memgraph::slk::Builder *builder) {
|
void Save(const memgraph::coordination::DemoteMainToReplicaReq &self, memgraph::slk::Builder *builder) {
|
||||||
memgraph::slk::Save(self.replication_client_info, builder);
|
memgraph::slk::Save(self.replication_client_info, builder);
|
||||||
}
|
}
|
||||||
@ -143,6 +169,8 @@ void Load(memgraph::coordination::DemoteMainToReplicaRes *self, memgraph::slk::R
|
|||||||
memgraph::slk::Load(&self->success, reader);
|
memgraph::slk::Load(&self->success, reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// UnregisterReplicaRpc
|
||||||
|
|
||||||
void Save(memgraph::coordination::UnregisterReplicaReq const &self, memgraph::slk::Builder *builder) {
|
void Save(memgraph::coordination::UnregisterReplicaReq const &self, memgraph::slk::Builder *builder) {
|
||||||
memgraph::slk::Save(self.instance_name, builder);
|
memgraph::slk::Save(self.instance_name, builder);
|
||||||
}
|
}
|
||||||
@ -167,6 +195,24 @@ void Load(memgraph::coordination::EnableWritingOnMainRes *self, memgraph::slk::R
|
|||||||
memgraph::slk::Load(&self->success, reader);
|
memgraph::slk::Load(&self->success, reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetInstanceUUIDRpc
|
||||||
|
|
||||||
|
void Save(const memgraph::coordination::GetInstanceUUIDReq & /*self*/, memgraph::slk::Builder * /*builder*/) {
|
||||||
|
/* nothing to serialize*/
|
||||||
|
}
|
||||||
|
|
||||||
|
void Load(memgraph::coordination::GetInstanceUUIDReq * /*self*/, memgraph::slk::Reader * /*reader*/) {
|
||||||
|
/* nothing to serialize*/
|
||||||
|
}
|
||||||
|
|
||||||
|
void Save(const memgraph::coordination::GetInstanceUUIDRes &self, memgraph::slk::Builder *builder) {
|
||||||
|
memgraph::slk::Save(self.uuid, builder);
|
||||||
|
}
|
||||||
|
|
||||||
|
void Load(memgraph::coordination::GetInstanceUUIDRes *self, memgraph::slk::Reader *reader) {
|
||||||
|
memgraph::slk::Load(&self->uuid, reader);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace slk
|
} // namespace slk
|
||||||
|
|
||||||
} // namespace memgraph
|
} // namespace memgraph
|
||||||
|
@ -15,6 +15,8 @@
|
|||||||
|
|
||||||
#include "coordination/coordinator_config.hpp"
|
#include "coordination/coordinator_config.hpp"
|
||||||
#include "rpc/client.hpp"
|
#include "rpc/client.hpp"
|
||||||
|
#include "rpc_errors.hpp"
|
||||||
|
#include "utils/result.hpp"
|
||||||
#include "utils/scheduler.hpp"
|
#include "utils/scheduler.hpp"
|
||||||
#include "utils/uuid.hpp"
|
#include "utils/uuid.hpp"
|
||||||
|
|
||||||
@ -46,7 +48,7 @@ class CoordinatorClient {
|
|||||||
auto SocketAddress() const -> std::string;
|
auto SocketAddress() const -> std::string;
|
||||||
|
|
||||||
[[nodiscard]] auto DemoteToReplica() const -> bool;
|
[[nodiscard]] auto DemoteToReplica() const -> bool;
|
||||||
// TODO: (andi) Consistent naming
|
|
||||||
auto SendPromoteReplicaToMainRpc(const utils::UUID &uuid, ReplicationClientsInfo replication_clients_info) const
|
auto SendPromoteReplicaToMainRpc(const utils::UUID &uuid, ReplicationClientsInfo replication_clients_info) const
|
||||||
-> bool;
|
-> bool;
|
||||||
|
|
||||||
@ -56,6 +58,8 @@ class CoordinatorClient {
|
|||||||
|
|
||||||
auto SendEnableWritingOnMainRpc() const -> bool;
|
auto SendEnableWritingOnMainRpc() const -> bool;
|
||||||
|
|
||||||
|
auto SendGetInstanceUUIDRpc() const -> memgraph::utils::BasicResult<GetInstanceUUIDError, std::optional<utils::UUID>>;
|
||||||
|
|
||||||
auto ReplicationClientInfo() const -> ReplClientInfo;
|
auto ReplicationClientInfo() const -> ReplClientInfo;
|
||||||
|
|
||||||
auto SetCallbacks(HealthCheckCallback succ_cb, HealthCheckCallback fail_cb) -> void;
|
auto SetCallbacks(HealthCheckCallback succ_cb, HealthCheckCallback fail_cb) -> void;
|
||||||
@ -64,6 +68,8 @@ class CoordinatorClient {
|
|||||||
|
|
||||||
auto InstanceDownTimeoutSec() const -> std::chrono::seconds;
|
auto InstanceDownTimeoutSec() const -> std::chrono::seconds;
|
||||||
|
|
||||||
|
auto InstanceGetUUIDFrequencySec() const -> std::chrono::seconds;
|
||||||
|
|
||||||
friend bool operator==(CoordinatorClient const &first, CoordinatorClient const &second) {
|
friend bool operator==(CoordinatorClient const &first, CoordinatorClient const &second) {
|
||||||
return first.config_ == second.config_;
|
return first.config_ == second.config_;
|
||||||
}
|
}
|
||||||
@ -71,7 +77,6 @@ class CoordinatorClient {
|
|||||||
private:
|
private:
|
||||||
utils::Scheduler instance_checker_;
|
utils::Scheduler instance_checker_;
|
||||||
|
|
||||||
// TODO: (andi) Pimpl?
|
|
||||||
communication::ClientContext rpc_context_;
|
communication::ClientContext rpc_context_;
|
||||||
mutable rpc::Client rpc_client_;
|
mutable rpc::Client rpc_client_;
|
||||||
|
|
||||||
|
@ -30,6 +30,7 @@ struct CoordinatorClientConfig {
|
|||||||
uint16_t port{};
|
uint16_t port{};
|
||||||
std::chrono::seconds instance_health_check_frequency_sec{1};
|
std::chrono::seconds instance_health_check_frequency_sec{1};
|
||||||
std::chrono::seconds instance_down_timeout_sec{5};
|
std::chrono::seconds instance_down_timeout_sec{5};
|
||||||
|
std::chrono::seconds instance_get_uuid_frequency_sec{10};
|
||||||
|
|
||||||
auto SocketAddress() const -> std::string { return ip_address + ":" + std::to_string(port); }
|
auto SocketAddress() const -> std::string { return ip_address + ":" + std::to_string(port); }
|
||||||
|
|
||||||
|
@ -38,6 +38,9 @@ class CoordinatorHandlers {
|
|||||||
slk::Builder *res_builder);
|
slk::Builder *res_builder);
|
||||||
static void EnableWritingOnMainHandler(replication::ReplicationHandler &replication_handler, slk::Reader *req_reader,
|
static void EnableWritingOnMainHandler(replication::ReplicationHandler &replication_handler, slk::Reader *req_reader,
|
||||||
slk::Builder *res_builder);
|
slk::Builder *res_builder);
|
||||||
|
|
||||||
|
static void GetInstanceUUIDHandler(replication::ReplicationHandler &replication_handler, slk::Reader *req_reader,
|
||||||
|
slk::Builder *res_builder);
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace memgraph::dbms
|
} // namespace memgraph::dbms
|
||||||
|
@ -136,6 +136,31 @@ struct EnableWritingOnMainRes {
|
|||||||
|
|
||||||
using EnableWritingOnMainRpc = rpc::RequestResponse<EnableWritingOnMainReq, EnableWritingOnMainRes>;
|
using EnableWritingOnMainRpc = rpc::RequestResponse<EnableWritingOnMainReq, EnableWritingOnMainRes>;
|
||||||
|
|
||||||
|
struct GetInstanceUUIDReq {
|
||||||
|
static const utils::TypeInfo kType;
|
||||||
|
static const utils::TypeInfo &GetTypeInfo() { return kType; }
|
||||||
|
|
||||||
|
static void Load(GetInstanceUUIDReq *self, memgraph::slk::Reader *reader);
|
||||||
|
static void Save(const GetInstanceUUIDReq &self, memgraph::slk::Builder *builder);
|
||||||
|
|
||||||
|
GetInstanceUUIDReq() = default;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct GetInstanceUUIDRes {
|
||||||
|
static const utils::TypeInfo kType;
|
||||||
|
static const utils::TypeInfo &GetTypeInfo() { return kType; }
|
||||||
|
|
||||||
|
static void Load(GetInstanceUUIDRes *self, memgraph::slk::Reader *reader);
|
||||||
|
static void Save(const GetInstanceUUIDRes &self, memgraph::slk::Builder *builder);
|
||||||
|
|
||||||
|
explicit GetInstanceUUIDRes(std::optional<utils::UUID> uuid) : uuid(uuid) {}
|
||||||
|
GetInstanceUUIDRes() = default;
|
||||||
|
|
||||||
|
std::optional<utils::UUID> uuid;
|
||||||
|
};
|
||||||
|
|
||||||
|
using GetInstanceUUIDRpc = rpc::RequestResponse<GetInstanceUUIDReq, GetInstanceUUIDRes>;
|
||||||
|
|
||||||
} // namespace memgraph::coordination
|
} // namespace memgraph::coordination
|
||||||
|
|
||||||
// SLK serialization declarations
|
// SLK serialization declarations
|
||||||
@ -153,6 +178,11 @@ void Load(memgraph::coordination::DemoteMainToReplicaRes *self, memgraph::slk::R
|
|||||||
void Save(const memgraph::coordination::DemoteMainToReplicaReq &self, memgraph::slk::Builder *builder);
|
void Save(const memgraph::coordination::DemoteMainToReplicaReq &self, memgraph::slk::Builder *builder);
|
||||||
void Load(memgraph::coordination::DemoteMainToReplicaReq *self, memgraph::slk::Reader *reader);
|
void Load(memgraph::coordination::DemoteMainToReplicaReq *self, memgraph::slk::Reader *reader);
|
||||||
|
|
||||||
|
// GetInstanceUUIDRpc
|
||||||
|
void Save(const memgraph::coordination::GetInstanceUUIDReq &self, memgraph::slk::Builder *builder);
|
||||||
|
void Load(memgraph::coordination::GetInstanceUUIDReq *self, memgraph::slk::Reader *reader);
|
||||||
|
void Save(const memgraph::coordination::GetInstanceUUIDRes &self, memgraph::slk::Builder *builder);
|
||||||
|
void Load(memgraph::coordination::GetInstanceUUIDRes *self, memgraph::slk::Reader *reader);
|
||||||
// UnregisterReplicaRpc
|
// UnregisterReplicaRpc
|
||||||
void Save(memgraph::coordination::UnregisterReplicaRes const &self, memgraph::slk::Builder *builder);
|
void Save(memgraph::coordination::UnregisterReplicaRes const &self, memgraph::slk::Builder *builder);
|
||||||
void Load(memgraph::coordination::UnregisterReplicaRes *self, memgraph::slk::Reader *reader);
|
void Load(memgraph::coordination::UnregisterReplicaRes *self, memgraph::slk::Reader *reader);
|
||||||
|
@ -18,6 +18,7 @@
|
|||||||
#include "replication_coordination_glue/role.hpp"
|
#include "replication_coordination_glue/role.hpp"
|
||||||
|
|
||||||
#include <libnuraft/nuraft.hxx>
|
#include <libnuraft/nuraft.hxx>
|
||||||
|
#include "utils/result.hpp"
|
||||||
#include "utils/uuid.hpp"
|
#include "utils/uuid.hpp"
|
||||||
|
|
||||||
namespace memgraph::coordination {
|
namespace memgraph::coordination {
|
||||||
@ -37,6 +38,9 @@ class ReplicationInstance {
|
|||||||
|
|
||||||
auto OnSuccessPing() -> void;
|
auto OnSuccessPing() -> void;
|
||||||
auto OnFailPing() -> bool;
|
auto OnFailPing() -> bool;
|
||||||
|
auto IsReadyForUUIDPing() -> bool;
|
||||||
|
|
||||||
|
void UpdateReplicaLastResponseUUID();
|
||||||
|
|
||||||
auto IsAlive() const -> bool;
|
auto IsAlive() const -> bool;
|
||||||
|
|
||||||
@ -62,7 +66,8 @@ class ReplicationInstance {
|
|||||||
auto SendSwapAndUpdateUUID(const utils::UUID &new_main_uuid) -> bool;
|
auto SendSwapAndUpdateUUID(const utils::UUID &new_main_uuid) -> bool;
|
||||||
auto SendUnregisterReplicaRpc(std::string const &instance_name) -> bool;
|
auto SendUnregisterReplicaRpc(std::string const &instance_name) -> bool;
|
||||||
|
|
||||||
// TODO: (andi) Inconsistent API
|
|
||||||
|
auto SendGetInstanceUUID() -> utils::BasicResult<coordination::GetInstanceUUIDError, std::optional<utils::UUID>>;
|
||||||
auto GetClient() -> CoordinatorClient &;
|
auto GetClient() -> CoordinatorClient &;
|
||||||
|
|
||||||
auto EnableWritingOnMain() -> bool;
|
auto EnableWritingOnMain() -> bool;
|
||||||
@ -76,6 +81,7 @@ class ReplicationInstance {
|
|||||||
replication_coordination_glue::ReplicationRole replication_role_;
|
replication_coordination_glue::ReplicationRole replication_role_;
|
||||||
std::chrono::system_clock::time_point last_response_time_{};
|
std::chrono::system_clock::time_point last_response_time_{};
|
||||||
bool is_alive_{false};
|
bool is_alive_{false};
|
||||||
|
std::chrono::system_clock::time_point last_check_of_uuid_{};
|
||||||
|
|
||||||
// for replica this is main uuid of current main
|
// for replica this is main uuid of current main
|
||||||
// for "main" main this same as in CoordinatorData
|
// for "main" main this same as in CoordinatorData
|
||||||
|
14
src/coordination/include/coordination/rpc_errors.hpp
Normal file
14
src/coordination/include/coordination/rpc_errors.hpp
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
// Copyright 2024 Memgraph Ltd.
|
||||||
|
//
|
||||||
|
// Use of this software is governed by the Business Source License
|
||||||
|
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||||
|
// License, and you may not use this file except in compliance with the Business Source License.
|
||||||
|
//
|
||||||
|
// As of the Change Date specified in that file, in accordance with
|
||||||
|
// the Business Source License, use of this software will be governed
|
||||||
|
// by the Apache License, Version 2.0, included in the file
|
||||||
|
// licenses/APL.txt.
|
||||||
|
|
||||||
|
namespace memgraph::coordination {
|
||||||
|
enum class GetInstanceUUIDError { NO_RESPONSE, RPC_EXCEPTION };
|
||||||
|
} // namespace memgraph::coordination
|
@ -14,6 +14,7 @@
|
|||||||
#include "coordination/replication_instance.hpp"
|
#include "coordination/replication_instance.hpp"
|
||||||
|
|
||||||
#include "replication_coordination_glue/handler.hpp"
|
#include "replication_coordination_glue/handler.hpp"
|
||||||
|
#include "utils/result.hpp"
|
||||||
|
|
||||||
namespace memgraph::coordination {
|
namespace memgraph::coordination {
|
||||||
|
|
||||||
@ -39,6 +40,11 @@ auto ReplicationInstance::OnFailPing() -> bool {
|
|||||||
return is_alive_;
|
return is_alive_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto ReplicationInstance::IsReadyForUUIDPing() -> bool {
|
||||||
|
return std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now() - last_check_of_uuid_) >
|
||||||
|
client_.InstanceGetUUIDFrequencySec();
|
||||||
|
}
|
||||||
|
|
||||||
auto ReplicationInstance::InstanceName() const -> std::string { return client_.InstanceName(); }
|
auto ReplicationInstance::InstanceName() const -> std::string { return client_.InstanceName(); }
|
||||||
auto ReplicationInstance::SocketAddress() const -> std::string { return client_.SocketAddress(); }
|
auto ReplicationInstance::SocketAddress() const -> std::string { return client_.SocketAddress(); }
|
||||||
auto ReplicationInstance::IsAlive() const -> bool { return is_alive_; }
|
auto ReplicationInstance::IsAlive() const -> bool { return is_alive_; }
|
||||||
@ -91,11 +97,21 @@ auto ReplicationInstance::ResetMainUUID() -> void { main_uuid_ = std::nullopt; }
|
|||||||
auto ReplicationInstance::GetMainUUID() const -> std::optional<utils::UUID> const & { return main_uuid_; }
|
auto ReplicationInstance::GetMainUUID() const -> std::optional<utils::UUID> const & { return main_uuid_; }
|
||||||
|
|
||||||
auto ReplicationInstance::EnsureReplicaHasCorrectMainUUID(utils::UUID const &curr_main_uuid) -> bool {
|
auto ReplicationInstance::EnsureReplicaHasCorrectMainUUID(utils::UUID const &curr_main_uuid) -> bool {
|
||||||
if (!main_uuid_ || *main_uuid_ != curr_main_uuid) {
|
if (!IsReadyForUUIDPing()) {
|
||||||
return SendSwapAndUpdateUUID(curr_main_uuid);
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
auto res = SendGetInstanceUUID();
|
||||||
|
if (res.HasError()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
UpdateReplicaLastResponseUUID();
|
||||||
|
|
||||||
|
if (res.GetValue().has_value() && res.GetValue().value() == curr_main_uuid) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return SendSwapAndUpdateUUID(curr_main_uuid);
|
||||||
|
}
|
||||||
|
|
||||||
auto ReplicationInstance::SendSwapAndUpdateUUID(const utils::UUID &new_main_uuid) -> bool {
|
auto ReplicationInstance::SendSwapAndUpdateUUID(const utils::UUID &new_main_uuid) -> bool {
|
||||||
if (!replication_coordination_glue::SendSwapMainUUIDRpc(client_.RpcClient(), new_main_uuid)) {
|
if (!replication_coordination_glue::SendSwapMainUUIDRpc(client_.RpcClient(), new_main_uuid)) {
|
||||||
@ -111,5 +127,12 @@ auto ReplicationInstance::SendUnregisterReplicaRpc(std::string const &instance_n
|
|||||||
|
|
||||||
auto ReplicationInstance::EnableWritingOnMain() -> bool { return client_.SendEnableWritingOnMainRpc(); }
|
auto ReplicationInstance::EnableWritingOnMain() -> bool { return client_.SendEnableWritingOnMainRpc(); }
|
||||||
|
|
||||||
|
auto ReplicationInstance::SendGetInstanceUUID()
|
||||||
|
-> utils::BasicResult<coordination::GetInstanceUUIDError, std::optional<utils::UUID>> {
|
||||||
|
return client_.SendGetInstanceUUIDRpc();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ReplicationInstance::UpdateReplicaLastResponseUUID() { last_check_of_uuid_ = std::chrono::system_clock::now(); }
|
||||||
|
|
||||||
} // namespace memgraph::coordination
|
} // namespace memgraph::coordination
|
||||||
#endif
|
#endif
|
||||||
|
@ -22,6 +22,8 @@ DEFINE_uint32(raft_server_id, 0, "Unique ID of the raft server.");
|
|||||||
DEFINE_uint32(instance_down_timeout_sec, 5, "Time duration after which an instance is considered down.");
|
DEFINE_uint32(instance_down_timeout_sec, 5, "Time duration after which an instance is considered down.");
|
||||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||||
DEFINE_uint32(instance_health_check_frequency_sec, 1, "The time duration between two health checks/pings.");
|
DEFINE_uint32(instance_health_check_frequency_sec, 1, "The time duration between two health checks/pings.");
|
||||||
|
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||||
|
DEFINE_uint32(instance_get_uuid_frequency_sec, 10, "The time duration between two instance uuid checks.");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||||
|
@ -24,6 +24,8 @@ DECLARE_uint32(raft_server_id);
|
|||||||
DECLARE_uint32(instance_down_timeout_sec);
|
DECLARE_uint32(instance_down_timeout_sec);
|
||||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||||
DECLARE_uint32(instance_health_check_frequency_sec);
|
DECLARE_uint32(instance_health_check_frequency_sec);
|
||||||
|
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||||
|
DECLARE_uint32(instance_get_uuid_frequency_sec);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||||
|
@ -359,6 +359,7 @@ int main(int argc, char **argv) {
|
|||||||
#ifdef MG_ENTERPRISE
|
#ifdef MG_ENTERPRISE
|
||||||
.instance_down_timeout_sec = std::chrono::seconds(FLAGS_instance_down_timeout_sec),
|
.instance_down_timeout_sec = std::chrono::seconds(FLAGS_instance_down_timeout_sec),
|
||||||
.instance_health_check_frequency_sec = std::chrono::seconds(FLAGS_instance_health_check_frequency_sec),
|
.instance_health_check_frequency_sec = std::chrono::seconds(FLAGS_instance_health_check_frequency_sec),
|
||||||
|
.instance_get_uuid_frequency_sec = std::chrono::seconds(FLAGS_instance_get_uuid_frequency_sec),
|
||||||
#endif
|
#endif
|
||||||
.default_kafka_bootstrap_servers = FLAGS_kafka_bootstrap_servers,
|
.default_kafka_bootstrap_servers = FLAGS_kafka_bootstrap_servers,
|
||||||
.default_pulsar_service_url = FLAGS_pulsar_service_url,
|
.default_pulsar_service_url = FLAGS_pulsar_service_url,
|
||||||
|
@ -24,6 +24,7 @@ struct InterpreterConfig {
|
|||||||
|
|
||||||
std::chrono::seconds instance_down_timeout_sec{5};
|
std::chrono::seconds instance_down_timeout_sec{5};
|
||||||
std::chrono::seconds instance_health_check_frequency_sec{1};
|
std::chrono::seconds instance_health_check_frequency_sec{1};
|
||||||
|
std::chrono::seconds instance_get_uuid_frequency_sec{10};
|
||||||
|
|
||||||
std::string default_kafka_bootstrap_servers;
|
std::string default_kafka_bootstrap_servers;
|
||||||
std::string default_pulsar_service_url;
|
std::string default_pulsar_service_url;
|
||||||
|
@ -329,7 +329,7 @@ class ReplQueryHandler {
|
|||||||
.port = static_cast<uint16_t>(*port),
|
.port = static_cast<uint16_t>(*port),
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!handler_->SetReplicationRoleReplica(config, std::nullopt)) {
|
if (!handler_->TrySetReplicationRoleReplica(config, std::nullopt)) {
|
||||||
throw QueryRuntimeException("Couldn't set role to replica!");
|
throw QueryRuntimeException("Couldn't set role to replica!");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -486,8 +486,9 @@ class CoordQueryHandler final : public query::CoordinatorQueryHandler {
|
|||||||
void RegisterReplicationInstance(std::string const &coordinator_socket_address,
|
void RegisterReplicationInstance(std::string const &coordinator_socket_address,
|
||||||
std::string const &replication_socket_address,
|
std::string const &replication_socket_address,
|
||||||
std::chrono::seconds const &instance_check_frequency,
|
std::chrono::seconds const &instance_check_frequency,
|
||||||
std::chrono::seconds const &instance_down_timeout, std::string const &instance_name,
|
std::chrono::seconds const &instance_down_timeout,
|
||||||
CoordinatorQuery::SyncMode sync_mode) override {
|
std::chrono::seconds const &instance_get_uuid_frequency,
|
||||||
|
std::string const &instance_name, CoordinatorQuery::SyncMode sync_mode) override {
|
||||||
const auto maybe_replication_ip_port =
|
const auto maybe_replication_ip_port =
|
||||||
io::network::Endpoint::ParseSocketOrAddress(replication_socket_address, std::nullopt);
|
io::network::Endpoint::ParseSocketOrAddress(replication_socket_address, std::nullopt);
|
||||||
if (!maybe_replication_ip_port) {
|
if (!maybe_replication_ip_port) {
|
||||||
@ -514,6 +515,7 @@ class CoordQueryHandler final : public query::CoordinatorQueryHandler {
|
|||||||
.port = coordinator_server_port,
|
.port = coordinator_server_port,
|
||||||
.instance_health_check_frequency_sec = instance_check_frequency,
|
.instance_health_check_frequency_sec = instance_check_frequency,
|
||||||
.instance_down_timeout_sec = instance_down_timeout,
|
.instance_down_timeout_sec = instance_down_timeout,
|
||||||
|
.instance_get_uuid_frequency_sec = instance_get_uuid_frequency,
|
||||||
.replication_client_info = repl_config,
|
.replication_client_info = repl_config,
|
||||||
.ssl = std::nullopt};
|
.ssl = std::nullopt};
|
||||||
|
|
||||||
@ -1185,11 +1187,12 @@ Callback HandleCoordinatorQuery(CoordinatorQuery *coordinator_query, const Param
|
|||||||
instance_health_check_frequency_sec = config.instance_health_check_frequency_sec,
|
instance_health_check_frequency_sec = config.instance_health_check_frequency_sec,
|
||||||
instance_name = coordinator_query->instance_name_,
|
instance_name = coordinator_query->instance_name_,
|
||||||
instance_down_timeout_sec = config.instance_down_timeout_sec,
|
instance_down_timeout_sec = config.instance_down_timeout_sec,
|
||||||
|
instance_get_uuid_frequency_sec = config.instance_get_uuid_frequency_sec,
|
||||||
sync_mode = coordinator_query->sync_mode_]() mutable {
|
sync_mode = coordinator_query->sync_mode_]() mutable {
|
||||||
handler.RegisterReplicationInstance(std::string(coordinator_socket_address_tv.ValueString()),
|
handler.RegisterReplicationInstance(std::string(coordinator_socket_address_tv.ValueString()),
|
||||||
std::string(replication_socket_address_tv.ValueString()),
|
std::string(replication_socket_address_tv.ValueString()),
|
||||||
instance_health_check_frequency_sec, instance_down_timeout_sec,
|
instance_health_check_frequency_sec, instance_down_timeout_sec,
|
||||||
instance_name, sync_mode);
|
instance_get_uuid_frequency_sec, instance_name, sync_mode);
|
||||||
return std::vector<std::vector<TypedValue>>();
|
return std::vector<std::vector<TypedValue>>();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -109,6 +109,7 @@ class CoordinatorQueryHandler {
|
|||||||
std::string const &replication_socket_address,
|
std::string const &replication_socket_address,
|
||||||
std::chrono::seconds const &instance_health_check_frequency,
|
std::chrono::seconds const &instance_health_check_frequency,
|
||||||
std::chrono::seconds const &instance_down_timeout,
|
std::chrono::seconds const &instance_down_timeout,
|
||||||
|
std::chrono::seconds const &instance_get_uuid_frequency,
|
||||||
std::string const &instance_name, CoordinatorQuery::SyncMode sync_mode) = 0;
|
std::string const &instance_name, CoordinatorQuery::SyncMode sync_mode) = 0;
|
||||||
|
|
||||||
/// @throw QueryRuntimeException if an error ocurred.
|
/// @throw QueryRuntimeException if an error ocurred.
|
||||||
|
@ -49,6 +49,9 @@ struct ReplicationQueryHandler {
|
|||||||
virtual bool SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
virtual bool SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
||||||
const std::optional<utils::UUID> &main_uuid) = 0;
|
const std::optional<utils::UUID> &main_uuid) = 0;
|
||||||
|
|
||||||
|
virtual bool TrySetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
||||||
|
const std::optional<utils::UUID> &main_uuid) = 0;
|
||||||
|
|
||||||
// as MAIN, define and connect to REPLICAs
|
// as MAIN, define and connect to REPLICAs
|
||||||
virtual auto TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid)
|
virtual auto TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid)
|
||||||
-> utils::BasicResult<RegisterReplicaError> = 0;
|
-> utils::BasicResult<RegisterReplicaError> = 0;
|
||||||
|
@ -62,8 +62,9 @@ ReplicationState::ReplicationState(std::optional<std::filesystem::path> durabili
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
if (std::holds_alternative<RoleReplicaData>(replication_data)) {
|
if (std::holds_alternative<RoleReplicaData>(replication_data)) {
|
||||||
spdlog::trace("Recovered main's uuid for replica {}",
|
auto &replica_uuid = std::get<RoleReplicaData>(replication_data).uuid_;
|
||||||
std::string(std::get<RoleReplicaData>(replication_data).uuid_.value()));
|
std::string uuid = replica_uuid.has_value() ? std::string(replica_uuid.value()) : "";
|
||||||
|
spdlog::trace("Recovered main's uuid for replica {}", uuid);
|
||||||
} else {
|
} else {
|
||||||
spdlog::trace("Recovered uuid for main {}", std::string(std::get<RoleMainData>(replication_data).uuid_));
|
spdlog::trace("Recovered uuid for main {}", std::string(std::get<RoleMainData>(replication_data).uuid_));
|
||||||
}
|
}
|
||||||
|
@ -14,6 +14,7 @@
|
|||||||
#include "dbms/dbms_handler.hpp"
|
#include "dbms/dbms_handler.hpp"
|
||||||
#include "flags/experimental.hpp"
|
#include "flags/experimental.hpp"
|
||||||
#include "replication/include/replication/state.hpp"
|
#include "replication/include/replication/state.hpp"
|
||||||
|
#include "replication_handler/system_replication.hpp"
|
||||||
#include "replication_handler/system_rpc.hpp"
|
#include "replication_handler/system_rpc.hpp"
|
||||||
#include "utils/result.hpp"
|
#include "utils/result.hpp"
|
||||||
|
|
||||||
@ -113,10 +114,14 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
|
|||||||
// as REPLICA, become MAIN
|
// as REPLICA, become MAIN
|
||||||
bool SetReplicationRoleMain() override;
|
bool SetReplicationRoleMain() override;
|
||||||
|
|
||||||
// as MAIN, become REPLICA
|
// as MAIN, become REPLICA, can be called on MAIN and REPLICA
|
||||||
bool SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
bool SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
||||||
const std::optional<utils::UUID> &main_uuid) override;
|
const std::optional<utils::UUID> &main_uuid) override;
|
||||||
|
|
||||||
|
// as MAIN, become REPLICA, can be called only on MAIN
|
||||||
|
bool TrySetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
||||||
|
const std::optional<utils::UUID> &main_uuid) override;
|
||||||
|
|
||||||
// as MAIN, define and connect to REPLICAs
|
// as MAIN, define and connect to REPLICAs
|
||||||
auto TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid)
|
auto TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid)
|
||||||
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> override;
|
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> override;
|
||||||
@ -137,12 +142,13 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
|
|||||||
auto GetReplState() const -> const memgraph::replication::ReplicationState &;
|
auto GetReplState() const -> const memgraph::replication::ReplicationState &;
|
||||||
auto GetReplState() -> memgraph::replication::ReplicationState &;
|
auto GetReplState() -> memgraph::replication::ReplicationState &;
|
||||||
|
|
||||||
|
auto GetReplicaUUID() -> std::optional<utils::UUID>;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
template <bool AllowReplicaToDivergeFromMain>
|
template <bool AllowRPCFailure>
|
||||||
auto RegisterReplica_(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid)
|
auto RegisterReplica_(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid)
|
||||||
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> {
|
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> {
|
||||||
MG_ASSERT(repl_state_.IsMain(), "Only main instance can register a replica!");
|
MG_ASSERT(repl_state_.IsMain(), "Only main instance can register a replica!");
|
||||||
|
|
||||||
auto maybe_client = repl_state_.RegisterReplica(config);
|
auto maybe_client = repl_state_.RegisterReplica(config);
|
||||||
if (maybe_client.HasError()) {
|
if (maybe_client.HasError()) {
|
||||||
switch (maybe_client.GetError()) {
|
switch (maybe_client.GetError()) {
|
||||||
@ -159,7 +165,6 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
using enum memgraph::flags::Experiments;
|
using enum memgraph::flags::Experiments;
|
||||||
bool system_replication_enabled = flags::AreExperimentsEnabled(SYSTEM_REPLICATION);
|
bool system_replication_enabled = flags::AreExperimentsEnabled(SYSTEM_REPLICATION);
|
||||||
if (!system_replication_enabled && dbms_handler_.Count() > 1) {
|
if (!system_replication_enabled && dbms_handler_.Count() > 1) {
|
||||||
@ -167,25 +172,21 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
|
|||||||
}
|
}
|
||||||
const auto main_uuid =
|
const auto main_uuid =
|
||||||
std::get<memgraph::replication::RoleMainData>(dbms_handler_.ReplicationState().ReplicationData()).uuid_;
|
std::get<memgraph::replication::RoleMainData>(dbms_handler_.ReplicationState().ReplicationData()).uuid_;
|
||||||
|
|
||||||
if (send_swap_uuid) {
|
if (send_swap_uuid) {
|
||||||
if (!memgraph::replication_coordination_glue::SendSwapMainUUIDRpc(maybe_client.GetValue()->rpc_client_,
|
if (!memgraph::replication_coordination_glue::SendSwapMainUUIDRpc(maybe_client.GetValue()->rpc_client_,
|
||||||
main_uuid)) {
|
main_uuid)) {
|
||||||
return memgraph::query::RegisterReplicaError::ERROR_ACCEPTING_MAIN;
|
return memgraph::query::RegisterReplicaError::ERROR_ACCEPTING_MAIN;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef MG_ENTERPRISE
|
#ifdef MG_ENTERPRISE
|
||||||
// Update system before enabling individual storage <-> replica clients
|
// Update system before enabling individual storage <-> replica clients
|
||||||
SystemRestore(*maybe_client.GetValue(), system_, dbms_handler_, main_uuid, auth_);
|
SystemRestore(*maybe_client.GetValue(), system_, dbms_handler_, main_uuid, auth_);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const auto dbms_error = HandleRegisterReplicaStatus(maybe_client);
|
const auto dbms_error = HandleRegisterReplicaStatus(maybe_client);
|
||||||
if (dbms_error.has_value()) {
|
if (dbms_error.has_value()) {
|
||||||
return *dbms_error;
|
return *dbms_error;
|
||||||
}
|
}
|
||||||
auto &instance_client_ptr = maybe_client.GetValue();
|
auto &instance_client_ptr = maybe_client.GetValue();
|
||||||
|
|
||||||
bool all_clients_good = true;
|
bool all_clients_good = true;
|
||||||
// Add database specific clients (NOTE Currently all databases are connected to each replica)
|
// Add database specific clients (NOTE Currently all databases are connected to each replica)
|
||||||
dbms_handler_.ForEach([&](dbms::DatabaseAccess db_acc) {
|
dbms_handler_.ForEach([&](dbms::DatabaseAccess db_acc) {
|
||||||
@ -195,7 +196,6 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
|
|||||||
}
|
}
|
||||||
// TODO: ATM only IN_MEMORY_TRANSACTIONAL, fix other modes
|
// TODO: ATM only IN_MEMORY_TRANSACTIONAL, fix other modes
|
||||||
if (storage->storage_mode_ != storage::StorageMode::IN_MEMORY_TRANSACTIONAL) return;
|
if (storage->storage_mode_ != storage::StorageMode::IN_MEMORY_TRANSACTIONAL) return;
|
||||||
|
|
||||||
all_clients_good &= storage->repl_storage_state_.replication_clients_.WithLock(
|
all_clients_good &= storage->repl_storage_state_.replication_clients_.WithLock(
|
||||||
[storage, &instance_client_ptr, db_acc = std::move(db_acc),
|
[storage, &instance_client_ptr, db_acc = std::move(db_acc),
|
||||||
main_uuid](auto &storage_clients) mutable { // NOLINT
|
main_uuid](auto &storage_clients) mutable { // NOLINT
|
||||||
@ -203,9 +203,12 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
|
|||||||
client->Start(storage, std::move(db_acc));
|
client->Start(storage, std::move(db_acc));
|
||||||
bool const success = std::invoke([state = client->State()]() {
|
bool const success = std::invoke([state = client->State()]() {
|
||||||
if (state == storage::replication::ReplicaState::DIVERGED_FROM_MAIN) {
|
if (state == storage::replication::ReplicaState::DIVERGED_FROM_MAIN) {
|
||||||
return AllowReplicaToDivergeFromMain;
|
return false;
|
||||||
}
|
}
|
||||||
return state != storage::replication::ReplicaState::MAYBE_BEHIND;
|
if (state == storage::replication::ReplicaState::MAYBE_BEHIND) {
|
||||||
|
return AllowRPCFailure;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
});
|
});
|
||||||
|
|
||||||
if (success) {
|
if (success) {
|
||||||
@ -214,14 +217,12 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
|
|||||||
return success;
|
return success;
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
// NOTE Currently if any databases fails, we revert back
|
// NOTE Currently if any databases fails, we revert back
|
||||||
if (!all_clients_good) {
|
if (!all_clients_good) {
|
||||||
spdlog::error("Failed to register all databases on the REPLICA \"{}\"", config.name);
|
spdlog::error("Failed to register all databases on the REPLICA \"{}\"", config.name);
|
||||||
UnregisterReplica(config.name);
|
UnregisterReplica(config.name);
|
||||||
return memgraph::query::RegisterReplicaError::CONNECTION_FAILED;
|
return memgraph::query::RegisterReplicaError::CONNECTION_FAILED;
|
||||||
}
|
}
|
||||||
|
|
||||||
// No client error, start instance level client
|
// No client error, start instance level client
|
||||||
#ifdef MG_ENTERPRISE
|
#ifdef MG_ENTERPRISE
|
||||||
StartReplicaClient(*instance_client_ptr, system_, dbms_handler_, main_uuid, auth_);
|
StartReplicaClient(*instance_client_ptr, system_, dbms_handler_, main_uuid, auth_);
|
||||||
@ -231,6 +232,57 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
|
|||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <bool AllowIdempotency>
|
||||||
|
bool SetReplicationRoleReplica_(const memgraph::replication::ReplicationServerConfig &config,
|
||||||
|
const std::optional<utils::UUID> &main_uuid) {
|
||||||
|
if (repl_state_.IsReplica()) {
|
||||||
|
if (!AllowIdempotency) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// We don't want to restart the server if we're already a REPLICA with correct config
|
||||||
|
auto &replica_data = std::get<memgraph::replication::RoleReplicaData>(repl_state_.ReplicationData());
|
||||||
|
if (replica_data.config == config) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
repl_state_.SetReplicationRoleReplica(config, main_uuid);
|
||||||
|
#ifdef MG_ENTERPRISE
|
||||||
|
return StartRpcServer(dbms_handler_, replica_data, auth_, system_);
|
||||||
|
#else
|
||||||
|
return StartRpcServer(dbms_handler_, replica_data);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO StorageState needs to be synched. Could have a dangling reference if someone adds a database as we are
|
||||||
|
// deleting the replica.
|
||||||
|
// Remove database specific clients
|
||||||
|
dbms_handler_.ForEach([&](memgraph::dbms::DatabaseAccess db_acc) {
|
||||||
|
auto *storage = db_acc->storage();
|
||||||
|
storage->repl_storage_state_.replication_clients_.WithLock([](auto &clients) { clients.clear(); });
|
||||||
|
});
|
||||||
|
// Remove instance level clients
|
||||||
|
std::get<memgraph::replication::RoleMainData>(repl_state_.ReplicationData()).registered_replicas_.clear();
|
||||||
|
|
||||||
|
// Creates the server
|
||||||
|
repl_state_.SetReplicationRoleReplica(config, main_uuid);
|
||||||
|
|
||||||
|
// Start
|
||||||
|
const auto success =
|
||||||
|
std::visit(memgraph::utils::Overloaded{[](memgraph::replication::RoleMainData &) {
|
||||||
|
// ASSERT
|
||||||
|
return false;
|
||||||
|
},
|
||||||
|
[this](memgraph::replication::RoleReplicaData &data) {
|
||||||
|
#ifdef MG_ENTERPRISE
|
||||||
|
return StartRpcServer(dbms_handler_, data, auth_, system_);
|
||||||
|
#else
|
||||||
|
return StartRpcServer(dbms_handler_, data);
|
||||||
|
#endif
|
||||||
|
}},
|
||||||
|
repl_state_.ReplicationData());
|
||||||
|
// TODO Handle error (restore to main?)
|
||||||
|
return success;
|
||||||
|
}
|
||||||
|
|
||||||
memgraph::replication::ReplicationState &repl_state_;
|
memgraph::replication::ReplicationState &repl_state_;
|
||||||
memgraph::dbms::DbmsHandler &dbms_handler_;
|
memgraph::dbms::DbmsHandler &dbms_handler_;
|
||||||
|
|
||||||
|
@ -192,41 +192,12 @@ bool ReplicationHandler::SetReplicationRoleMain() {
|
|||||||
|
|
||||||
bool ReplicationHandler::SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
bool ReplicationHandler::SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
||||||
const std::optional<utils::UUID> &main_uuid) {
|
const std::optional<utils::UUID> &main_uuid) {
|
||||||
// We don't want to restart the server if we're already a REPLICA
|
return SetReplicationRoleReplica_<false>(config, main_uuid);
|
||||||
if (repl_state_.IsReplica()) {
|
|
||||||
spdlog::trace("Instance has already has replica role.");
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO StorageState needs to be synched. Could have a dangling reference if someone adds a database as we are
|
bool ReplicationHandler::TrySetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
||||||
// deleting the replica.
|
const std::optional<utils::UUID> &main_uuid) {
|
||||||
// Remove database specific clients
|
return SetReplicationRoleReplica_<true>(config, main_uuid);
|
||||||
dbms_handler_.ForEach([&](memgraph::dbms::DatabaseAccess db_acc) {
|
|
||||||
auto *storage = db_acc->storage();
|
|
||||||
storage->repl_storage_state_.replication_clients_.WithLock([](auto &clients) { clients.clear(); });
|
|
||||||
});
|
|
||||||
// Remove instance level clients
|
|
||||||
std::get<memgraph::replication::RoleMainData>(repl_state_.ReplicationData()).registered_replicas_.clear();
|
|
||||||
|
|
||||||
// Creates the server
|
|
||||||
repl_state_.SetReplicationRoleReplica(config, main_uuid);
|
|
||||||
|
|
||||||
// Start
|
|
||||||
const auto success =
|
|
||||||
std::visit(memgraph::utils::Overloaded{[](memgraph::replication::RoleMainData &) {
|
|
||||||
// ASSERT
|
|
||||||
return false;
|
|
||||||
},
|
|
||||||
[this](memgraph::replication::RoleReplicaData &data) {
|
|
||||||
#ifdef MG_ENTERPRISE
|
|
||||||
return StartRpcServer(dbms_handler_, data, auth_, system_);
|
|
||||||
#else
|
|
||||||
return StartRpcServer(dbms_handler_, data);
|
|
||||||
#endif
|
|
||||||
}},
|
|
||||||
repl_state_.ReplicationData());
|
|
||||||
// TODO Handle error (restore to main?)
|
|
||||||
return success;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ReplicationHandler::DoReplicaToMainPromotion(const utils::UUID &main_uuid) {
|
bool ReplicationHandler::DoReplicaToMainPromotion(const utils::UUID &main_uuid) {
|
||||||
@ -258,13 +229,13 @@ bool ReplicationHandler::DoReplicaToMainPromotion(const utils::UUID &main_uuid)
|
|||||||
auto ReplicationHandler::TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config,
|
auto ReplicationHandler::TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config,
|
||||||
bool send_swap_uuid)
|
bool send_swap_uuid)
|
||||||
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> {
|
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> {
|
||||||
return RegisterReplica_<false>(config, send_swap_uuid);
|
return RegisterReplica_<true>(config, send_swap_uuid);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto ReplicationHandler::RegisterReplica(const memgraph::replication::ReplicationClientConfig &config,
|
auto ReplicationHandler::RegisterReplica(const memgraph::replication::ReplicationClientConfig &config,
|
||||||
bool send_swap_uuid)
|
bool send_swap_uuid)
|
||||||
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> {
|
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> {
|
||||||
return RegisterReplica_<true>(config, send_swap_uuid);
|
return RegisterReplica_<false>(config, send_swap_uuid);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto ReplicationHandler::UnregisterReplica(std::string_view name) -> memgraph::query::UnregisterReplicaResult {
|
auto ReplicationHandler::UnregisterReplica(std::string_view name) -> memgraph::query::UnregisterReplicaResult {
|
||||||
@ -297,6 +268,11 @@ auto ReplicationHandler::GetRole() const -> memgraph::replication_coordination_g
|
|||||||
return repl_state_.GetRole();
|
return repl_state_.GetRole();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto ReplicationHandler::GetReplicaUUID() -> std::optional<utils::UUID> {
|
||||||
|
MG_ASSERT(repl_state_.IsReplica());
|
||||||
|
return std::get<RoleReplicaData>(repl_state_.ReplicationData()).uuid_;
|
||||||
|
}
|
||||||
|
|
||||||
auto ReplicationHandler::GetReplState() const -> const memgraph::replication::ReplicationState & { return repl_state_; }
|
auto ReplicationHandler::GetReplState() const -> const memgraph::replication::ReplicationState & { return repl_state_; }
|
||||||
|
|
||||||
auto ReplicationHandler::GetReplState() -> memgraph::replication::ReplicationState & { return repl_state_; }
|
auto ReplicationHandler::GetReplState() -> memgraph::replication::ReplicationState & { return repl_state_; }
|
||||||
|
@ -112,6 +112,9 @@ enum class TypeId : uint64_t {
|
|||||||
COORD_ENABLE_WRITING_ON_MAIN_REQ,
|
COORD_ENABLE_WRITING_ON_MAIN_REQ,
|
||||||
COORD_ENABLE_WRITING_ON_MAIN_RES,
|
COORD_ENABLE_WRITING_ON_MAIN_RES,
|
||||||
|
|
||||||
|
COORD_GET_UUID_REQ,
|
||||||
|
COORD_GET_UUID_RES,
|
||||||
|
|
||||||
// AST
|
// AST
|
||||||
AST_LABELIX = 3000,
|
AST_LABELIX = 3000,
|
||||||
AST_PROPERTYIX,
|
AST_PROPERTYIX,
|
||||||
|
@ -71,6 +71,7 @@ startup_config_dict = {
|
|||||||
"raft_server_id": ("0", "0", "Unique ID of the raft server."),
|
"raft_server_id": ("0", "0", "Unique ID of the raft server."),
|
||||||
"instance_down_timeout_sec": ("5", "5", "Time duration after which an instance is considered down."),
|
"instance_down_timeout_sec": ("5", "5", "Time duration after which an instance is considered down."),
|
||||||
"instance_health_check_frequency_sec": ("1", "1", "The time duration between two health checks/pings."),
|
"instance_health_check_frequency_sec": ("1", "1", "The time duration between two health checks/pings."),
|
||||||
|
"instance_get_uuid_frequency_sec": ("10", "10", "The time duration between two instance uuid checks."),
|
||||||
"data_directory": ("mg_data", "mg_data", "Path to directory in which to save all permanent data."),
|
"data_directory": ("mg_data", "mg_data", "Path to directory in which to save all permanent data."),
|
||||||
"data_recovery_on_startup": (
|
"data_recovery_on_startup": (
|
||||||
"false",
|
"false",
|
||||||
|
@ -10,11 +10,13 @@
|
|||||||
# licenses/APL.txt.
|
# licenses/APL.txt.
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
|
import tempfile
|
||||||
|
|
||||||
import interactive_mg_runner
|
import interactive_mg_runner
|
||||||
import pytest
|
import pytest
|
||||||
from common import execute_and_fetch_all
|
from common import execute_and_fetch_all, safe_execute
|
||||||
from mg_utils import mg_sleep_and_assert
|
from mg_utils import mg_sleep_and_assert
|
||||||
|
|
||||||
interactive_mg_runner.SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
interactive_mg_runner.SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||||
@ -38,7 +40,7 @@ MEMGRAPH_FIRST_CLUSTER_DESCRIPTION = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
MEMGRAPH_INSTANCES_DESCRIPTION = {
|
MEMGRAPH_SECOND_CLUSTER_DESCRIPTION = {
|
||||||
"replica": {
|
"replica": {
|
||||||
"args": ["--bolt-port", "7689", "--log-level", "TRACE"],
|
"args": ["--bolt-port", "7689", "--log-level", "TRACE"],
|
||||||
"log_file": "replica.log",
|
"log_file": "replica.log",
|
||||||
@ -71,7 +73,7 @@ def test_replication_works_on_failover(connection):
|
|||||||
assert actual_data_on_main == expected_data_on_main
|
assert actual_data_on_main == expected_data_on_main
|
||||||
|
|
||||||
# 3
|
# 3
|
||||||
interactive_mg_runner.start_all_keep_others(MEMGRAPH_INSTANCES_DESCRIPTION)
|
interactive_mg_runner.start_all_keep_others(MEMGRAPH_SECOND_CLUSTER_DESCRIPTION)
|
||||||
|
|
||||||
# 4
|
# 4
|
||||||
new_main_cursor = connection(7690, "main_2").cursor()
|
new_main_cursor = connection(7690, "main_2").cursor()
|
||||||
@ -113,5 +115,144 @@ def test_replication_works_on_failover(connection):
|
|||||||
interactive_mg_runner.stop_all()
|
interactive_mg_runner.stop_all()
|
||||||
|
|
||||||
|
|
||||||
|
def test_not_replicate_old_main_register_new_cluster(connection):
|
||||||
|
# Goal of this test is to check that although replica is registered in one cluster
|
||||||
|
# it can be re-registered to new cluster
|
||||||
|
# This flow checks if Registering replica is idempotent and that old main cannot talk to replica
|
||||||
|
# 1. We start all replicas and main in one cluster
|
||||||
|
# 2. Main from first cluster can see all replicas
|
||||||
|
# 3. We start all replicas and main in second cluster, by reusing one replica from first cluster
|
||||||
|
# 4. New main should see replica. Registration should pass (idempotent registration)
|
||||||
|
# 5. Old main should not talk to new replica
|
||||||
|
# 6. New main should talk to replica
|
||||||
|
|
||||||
|
TEMP_DIR = tempfile.TemporaryDirectory().name
|
||||||
|
MEMGRAPH_FISRT_COORD_CLUSTER_DESCRIPTION = {
|
||||||
|
"shared_instance": {
|
||||||
|
"args": [
|
||||||
|
"--bolt-port",
|
||||||
|
"7688",
|
||||||
|
"--log-level",
|
||||||
|
"TRACE",
|
||||||
|
"--coordinator-server-port",
|
||||||
|
"10011",
|
||||||
|
],
|
||||||
|
"log_file": "instance_1.log",
|
||||||
|
"data_directory": f"{TEMP_DIR}/shared_instance",
|
||||||
|
"setup_queries": [],
|
||||||
|
},
|
||||||
|
"instance_2": {
|
||||||
|
"args": [
|
||||||
|
"--bolt-port",
|
||||||
|
"7689",
|
||||||
|
"--log-level",
|
||||||
|
"TRACE",
|
||||||
|
"--coordinator-server-port",
|
||||||
|
"10012",
|
||||||
|
],
|
||||||
|
"log_file": "instance_2.log",
|
||||||
|
"data_directory": f"{TEMP_DIR}/instance_2",
|
||||||
|
"setup_queries": [],
|
||||||
|
},
|
||||||
|
"coordinator_1": {
|
||||||
|
"args": ["--bolt-port", "7690", "--log-level=TRACE", "--raft-server-id=1", "--raft-server-port=10111"],
|
||||||
|
"log_file": "coordinator.log",
|
||||||
|
"setup_queries": [
|
||||||
|
"REGISTER INSTANCE shared_instance ON '127.0.0.1:10011' WITH '127.0.0.1:10001';",
|
||||||
|
"REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002';",
|
||||||
|
"SET INSTANCE instance_2 TO MAIN",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
# 1
|
||||||
|
interactive_mg_runner.start_all_keep_others(MEMGRAPH_FISRT_COORD_CLUSTER_DESCRIPTION)
|
||||||
|
|
||||||
|
# 2
|
||||||
|
|
||||||
|
first_cluster_coord_cursor = connection(7690, "coord_1").cursor()
|
||||||
|
|
||||||
|
def show_repl_cluster():
|
||||||
|
return sorted(list(execute_and_fetch_all(first_cluster_coord_cursor, "SHOW INSTANCES;")))
|
||||||
|
|
||||||
|
expected_data_up_first_cluster = [
|
||||||
|
("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
|
||||||
|
("instance_2", "", "127.0.0.1:10012", True, "main"),
|
||||||
|
("shared_instance", "", "127.0.0.1:10011", True, "replica"),
|
||||||
|
]
|
||||||
|
|
||||||
|
mg_sleep_and_assert(expected_data_up_first_cluster, show_repl_cluster)
|
||||||
|
|
||||||
|
# 3
|
||||||
|
|
||||||
|
MEMGRAPH_SECOND_COORD_CLUSTER_DESCRIPTION = {
|
||||||
|
"instance_3": {
|
||||||
|
"args": [
|
||||||
|
"--bolt-port",
|
||||||
|
"7687",
|
||||||
|
"--log-level",
|
||||||
|
"TRACE",
|
||||||
|
"--coordinator-server-port",
|
||||||
|
"10013",
|
||||||
|
],
|
||||||
|
"log_file": "instance_3.log",
|
||||||
|
"data_directory": f"{TEMP_DIR}/instance_3",
|
||||||
|
"setup_queries": [],
|
||||||
|
},
|
||||||
|
"coordinator_2": {
|
||||||
|
"args": ["--bolt-port", "7691", "--log-level=TRACE", "--raft-server-id=1", "--raft-server-port=10112"],
|
||||||
|
"log_file": "coordinator.log",
|
||||||
|
"setup_queries": [],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
interactive_mg_runner.start_all_keep_others(MEMGRAPH_SECOND_COORD_CLUSTER_DESCRIPTION)
|
||||||
|
second_cluster_coord_cursor = connection(7691, "coord_2").cursor()
|
||||||
|
execute_and_fetch_all(
|
||||||
|
second_cluster_coord_cursor, "REGISTER INSTANCE shared_instance ON '127.0.0.1:10011' WITH '127.0.0.1:10001';"
|
||||||
|
)
|
||||||
|
execute_and_fetch_all(
|
||||||
|
second_cluster_coord_cursor, "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003';"
|
||||||
|
)
|
||||||
|
execute_and_fetch_all(second_cluster_coord_cursor, "SET INSTANCE instance_3 TO MAIN")
|
||||||
|
|
||||||
|
# 4
|
||||||
|
|
||||||
|
def show_repl_cluster():
|
||||||
|
return sorted(list(execute_and_fetch_all(second_cluster_coord_cursor, "SHOW INSTANCES;")))
|
||||||
|
|
||||||
|
expected_data_up_second_cluster = [
|
||||||
|
("coordinator_1", "127.0.0.1:10112", "", True, "coordinator"),
|
||||||
|
("instance_3", "", "127.0.0.1:10013", True, "main"),
|
||||||
|
("shared_instance", "", "127.0.0.1:10011", True, "replica"),
|
||||||
|
]
|
||||||
|
|
||||||
|
mg_sleep_and_assert(expected_data_up_second_cluster, show_repl_cluster)
|
||||||
|
|
||||||
|
# 5
|
||||||
|
main_1_cursor = connection(7689, "main_1").cursor()
|
||||||
|
with pytest.raises(Exception) as e:
|
||||||
|
execute_and_fetch_all(main_1_cursor, "CREATE ();")
|
||||||
|
assert (
|
||||||
|
str(e.value)
|
||||||
|
== "Replication Exception: At least one SYNC replica has not confirmed committing last transaction. Check the status of the replicas using 'SHOW REPLICAS' query."
|
||||||
|
)
|
||||||
|
|
||||||
|
shared_replica_cursor = connection(7688, "shared_replica").cursor()
|
||||||
|
res = execute_and_fetch_all(shared_replica_cursor, "MATCH (n) RETURN count(n);")[0][0]
|
||||||
|
assert res == 0, "Old main should not replicate to 'shared' replica"
|
||||||
|
|
||||||
|
# 6
|
||||||
|
main_2_cursor = connection(7687, "main_2").cursor()
|
||||||
|
|
||||||
|
execute_and_fetch_all(main_2_cursor, "CREATE ();")
|
||||||
|
|
||||||
|
shared_replica_cursor = connection(7688, "shared_replica").cursor()
|
||||||
|
res = execute_and_fetch_all(shared_replica_cursor, "MATCH (n) RETURN count(n);")[0][0]
|
||||||
|
assert res == 1, "New main should replicate to 'shared' replica"
|
||||||
|
|
||||||
|
interactive_mg_runner.stop_all()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
sys.exit(pytest.main([__file__, "-rA"]))
|
sys.exit(pytest.main([__file__, "-rA"]))
|
||||||
|
@ -147,6 +147,105 @@ def test_replication_works_on_failover():
|
|||||||
interactive_mg_runner.stop_all(MEMGRAPH_INSTANCES_DESCRIPTION)
|
interactive_mg_runner.stop_all(MEMGRAPH_INSTANCES_DESCRIPTION)
|
||||||
|
|
||||||
|
|
||||||
|
def test_replication_works_on_replica_instance_restart():
|
||||||
|
# Goal of this test is to check the replication works after replica goes down and restarts
|
||||||
|
# 1. We start all replicas, main and coordinator manually: we want to be able to kill them ourselves without relying on external tooling to kill processes.
|
||||||
|
# 2. We check that main has correct state
|
||||||
|
# 3. We kill replica
|
||||||
|
# 4. We check that main cannot replicate to replica
|
||||||
|
# 5. We bring replica back up
|
||||||
|
# 6. We check that replica gets data
|
||||||
|
safe_execute(shutil.rmtree, TEMP_DIR)
|
||||||
|
|
||||||
|
# 1
|
||||||
|
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
|
||||||
|
|
||||||
|
# 2
|
||||||
|
main_cursor = connect(host="localhost", port=7687).cursor()
|
||||||
|
expected_data_on_main = [
|
||||||
|
("instance_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
|
||||||
|
("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
|
||||||
|
]
|
||||||
|
actual_data_on_main = sorted(list(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;")))
|
||||||
|
assert actual_data_on_main == expected_data_on_main
|
||||||
|
|
||||||
|
# 3
|
||||||
|
coord_cursor = connect(host="localhost", port=7690).cursor()
|
||||||
|
|
||||||
|
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_2")
|
||||||
|
|
||||||
|
def retrieve_data_show_repl_cluster():
|
||||||
|
return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW INSTANCES;")))
|
||||||
|
|
||||||
|
expected_data_on_coord = [
|
||||||
|
("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
|
||||||
|
("instance_1", "", "127.0.0.1:10011", True, "replica"),
|
||||||
|
("instance_2", "", "127.0.0.1:10012", False, "unknown"),
|
||||||
|
("instance_3", "", "127.0.0.1:10013", True, "main"),
|
||||||
|
]
|
||||||
|
mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
|
||||||
|
|
||||||
|
def retrieve_data_show_replicas():
|
||||||
|
return sorted(list(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;")))
|
||||||
|
|
||||||
|
expected_data_on_main = [
|
||||||
|
("instance_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
|
||||||
|
("instance_2", "127.0.0.1:10002", "sync", 0, 0, "invalid"),
|
||||||
|
]
|
||||||
|
mg_sleep_and_assert(expected_data_on_main, retrieve_data_show_replicas)
|
||||||
|
|
||||||
|
# 4
|
||||||
|
instance_1_cursor = connect(host="localhost", port=7688).cursor()
|
||||||
|
with pytest.raises(Exception) as e:
|
||||||
|
execute_and_fetch_all(main_cursor, "CREATE ();")
|
||||||
|
assert (
|
||||||
|
str(e.value)
|
||||||
|
== "Replication Exception: At least one SYNC replica has not confirmed committing last transaction. Check the status of the replicas using 'SHOW REPLICAS' query."
|
||||||
|
)
|
||||||
|
|
||||||
|
res_instance_1 = execute_and_fetch_all(instance_1_cursor, "MATCH (n) RETURN count(n)")[0][0]
|
||||||
|
assert res_instance_1 == 1
|
||||||
|
|
||||||
|
def retrieve_data_show_replicas():
|
||||||
|
return sorted(list(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;")))
|
||||||
|
|
||||||
|
expected_data_on_main = [
|
||||||
|
("instance_1", "127.0.0.1:10001", "sync", 2, 0, "ready"),
|
||||||
|
("instance_2", "127.0.0.1:10002", "sync", 0, 0, "invalid"),
|
||||||
|
]
|
||||||
|
mg_sleep_and_assert(expected_data_on_main, retrieve_data_show_replicas)
|
||||||
|
|
||||||
|
# 5.
|
||||||
|
|
||||||
|
interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_2")
|
||||||
|
|
||||||
|
def retrieve_data_show_repl_cluster():
|
||||||
|
return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW INSTANCES;")))
|
||||||
|
|
||||||
|
expected_data_on_coord = [
|
||||||
|
("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
|
||||||
|
("instance_1", "", "127.0.0.1:10011", True, "replica"),
|
||||||
|
("instance_2", "", "127.0.0.1:10012", True, "replica"),
|
||||||
|
("instance_3", "", "127.0.0.1:10013", True, "main"),
|
||||||
|
]
|
||||||
|
mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
|
||||||
|
|
||||||
|
def retrieve_data_show_replicas():
|
||||||
|
return sorted(list(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;")))
|
||||||
|
|
||||||
|
expected_data_on_main = [
|
||||||
|
("instance_1", "127.0.0.1:10001", "sync", 2, 0, "ready"),
|
||||||
|
("instance_2", "127.0.0.1:10002", "sync", 2, 0, "ready"),
|
||||||
|
]
|
||||||
|
mg_sleep_and_assert(expected_data_on_main, retrieve_data_show_replicas)
|
||||||
|
|
||||||
|
# 6.
|
||||||
|
instance_2_cursor = connect(port=7689, host="localhost").cursor()
|
||||||
|
execute_and_fetch_all(main_cursor, "CREATE ();")
|
||||||
|
res_instance_2 = execute_and_fetch_all(instance_2_cursor, "MATCH (n) RETURN count(n)")[0][0]
|
||||||
|
assert res_instance_2 == 2
|
||||||
|
|
||||||
|
|
||||||
def test_show_instances():
|
def test_show_instances():
|
||||||
safe_execute(shutil.rmtree, TEMP_DIR)
|
safe_execute(shutil.rmtree, TEMP_DIR)
|
||||||
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
|
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
|
||||||
|
@ -142,7 +142,7 @@ TEST_F(ReplicationTest, BasicSynchronousReplicationTest) {
|
|||||||
MinMemgraph replica(repl_conf);
|
MinMemgraph replica(repl_conf);
|
||||||
|
|
||||||
auto replica_store_handler = replica.repl_handler;
|
auto replica_store_handler = replica.repl_handler;
|
||||||
replica_store_handler.SetReplicationRoleReplica(
|
replica_store_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = ports[0],
|
.port = ports[0],
|
||||||
@ -439,13 +439,13 @@ TEST_F(ReplicationTest, MultipleSynchronousReplicationTest) {
|
|||||||
MinMemgraph replica1(repl_conf);
|
MinMemgraph replica1(repl_conf);
|
||||||
MinMemgraph replica2(repl2_conf);
|
MinMemgraph replica2(repl2_conf);
|
||||||
|
|
||||||
replica1.repl_handler.SetReplicationRoleReplica(
|
replica1.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = ports[0],
|
.port = ports[0],
|
||||||
},
|
},
|
||||||
std::nullopt);
|
std::nullopt);
|
||||||
replica2.repl_handler.SetReplicationRoleReplica(
|
replica2.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = ports[1],
|
.port = ports[1],
|
||||||
@ -597,7 +597,7 @@ TEST_F(ReplicationTest, RecoveryProcess) {
|
|||||||
MinMemgraph replica(repl_conf);
|
MinMemgraph replica(repl_conf);
|
||||||
auto replica_store_handler = replica.repl_handler;
|
auto replica_store_handler = replica.repl_handler;
|
||||||
|
|
||||||
replica_store_handler.SetReplicationRoleReplica(
|
replica_store_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = ports[0],
|
.port = ports[0],
|
||||||
@ -676,7 +676,7 @@ TEST_F(ReplicationTest, BasicAsynchronousReplicationTest) {
|
|||||||
MinMemgraph replica_async(repl_conf);
|
MinMemgraph replica_async(repl_conf);
|
||||||
|
|
||||||
auto replica_store_handler = replica_async.repl_handler;
|
auto replica_store_handler = replica_async.repl_handler;
|
||||||
replica_store_handler.SetReplicationRoleReplica(
|
replica_store_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = ports[1],
|
.port = ports[1],
|
||||||
@ -726,7 +726,7 @@ TEST_F(ReplicationTest, EpochTest) {
|
|||||||
MinMemgraph main(main_conf);
|
MinMemgraph main(main_conf);
|
||||||
MinMemgraph replica1(repl_conf);
|
MinMemgraph replica1(repl_conf);
|
||||||
|
|
||||||
replica1.repl_handler.SetReplicationRoleReplica(
|
replica1.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = ports[0],
|
.port = ports[0],
|
||||||
@ -734,7 +734,7 @@ TEST_F(ReplicationTest, EpochTest) {
|
|||||||
std::nullopt);
|
std::nullopt);
|
||||||
|
|
||||||
MinMemgraph replica2(repl2_conf);
|
MinMemgraph replica2(repl2_conf);
|
||||||
replica2.repl_handler.SetReplicationRoleReplica(
|
replica2.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = 10001,
|
.port = 10001,
|
||||||
@ -819,7 +819,7 @@ TEST_F(ReplicationTest, EpochTest) {
|
|||||||
ASSERT_FALSE(acc->Commit().HasError());
|
ASSERT_FALSE(acc->Commit().HasError());
|
||||||
}
|
}
|
||||||
|
|
||||||
replica1.repl_handler.SetReplicationRoleReplica(
|
replica1.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = ports[0],
|
.port = ports[0],
|
||||||
@ -858,7 +858,7 @@ TEST_F(ReplicationTest, ReplicationInformation) {
|
|||||||
MinMemgraph replica1(repl_conf);
|
MinMemgraph replica1(repl_conf);
|
||||||
|
|
||||||
uint16_t replica1_port = 10001;
|
uint16_t replica1_port = 10001;
|
||||||
replica1.repl_handler.SetReplicationRoleReplica(
|
replica1.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = replica1_port,
|
.port = replica1_port,
|
||||||
@ -867,7 +867,7 @@ TEST_F(ReplicationTest, ReplicationInformation) {
|
|||||||
|
|
||||||
uint16_t replica2_port = 10002;
|
uint16_t replica2_port = 10002;
|
||||||
MinMemgraph replica2(repl2_conf);
|
MinMemgraph replica2(repl2_conf);
|
||||||
replica2.repl_handler.SetReplicationRoleReplica(
|
replica2.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = replica2_port,
|
.port = replica2_port,
|
||||||
@ -923,7 +923,7 @@ TEST_F(ReplicationTest, ReplicationReplicaWithExistingName) {
|
|||||||
MinMemgraph replica1(repl_conf);
|
MinMemgraph replica1(repl_conf);
|
||||||
|
|
||||||
uint16_t replica1_port = 10001;
|
uint16_t replica1_port = 10001;
|
||||||
replica1.repl_handler.SetReplicationRoleReplica(
|
replica1.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = replica1_port,
|
.port = replica1_port,
|
||||||
@ -932,7 +932,7 @@ TEST_F(ReplicationTest, ReplicationReplicaWithExistingName) {
|
|||||||
|
|
||||||
uint16_t replica2_port = 10002;
|
uint16_t replica2_port = 10002;
|
||||||
MinMemgraph replica2(repl2_conf);
|
MinMemgraph replica2(repl2_conf);
|
||||||
replica2.repl_handler.SetReplicationRoleReplica(
|
replica2.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = replica2_port,
|
.port = replica2_port,
|
||||||
@ -966,7 +966,7 @@ TEST_F(ReplicationTest, ReplicationReplicaWithExistingEndPoint) {
|
|||||||
|
|
||||||
MinMemgraph main(main_conf);
|
MinMemgraph main(main_conf);
|
||||||
MinMemgraph replica1(repl_conf);
|
MinMemgraph replica1(repl_conf);
|
||||||
replica1.repl_handler.SetReplicationRoleReplica(
|
replica1.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = common_port,
|
.port = common_port,
|
||||||
@ -974,7 +974,7 @@ TEST_F(ReplicationTest, ReplicationReplicaWithExistingEndPoint) {
|
|||||||
std::nullopt);
|
std::nullopt);
|
||||||
|
|
||||||
MinMemgraph replica2(repl2_conf);
|
MinMemgraph replica2(repl2_conf);
|
||||||
replica2.repl_handler.SetReplicationRoleReplica(
|
replica2.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = common_port,
|
.port = common_port,
|
||||||
@ -1023,7 +1023,7 @@ TEST_F(ReplicationTest, RestoringReplicationAtStartupAfterDroppingReplica) {
|
|||||||
std::optional<MinMemgraph> main(main_config);
|
std::optional<MinMemgraph> main(main_config);
|
||||||
MinMemgraph replica1(replica1_config);
|
MinMemgraph replica1(replica1_config);
|
||||||
|
|
||||||
replica1.repl_handler.SetReplicationRoleReplica(
|
replica1.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = ports[0],
|
.port = ports[0],
|
||||||
@ -1031,7 +1031,7 @@ TEST_F(ReplicationTest, RestoringReplicationAtStartupAfterDroppingReplica) {
|
|||||||
std::nullopt);
|
std::nullopt);
|
||||||
|
|
||||||
MinMemgraph replica2(replica2_config);
|
MinMemgraph replica2(replica2_config);
|
||||||
replica2.repl_handler.SetReplicationRoleReplica(
|
replica2.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = ports[1],
|
.port = ports[1],
|
||||||
@ -1088,7 +1088,7 @@ TEST_F(ReplicationTest, RestoringReplicationAtStartup) {
|
|||||||
std::optional<MinMemgraph> main(main_config);
|
std::optional<MinMemgraph> main(main_config);
|
||||||
MinMemgraph replica1(repl_conf);
|
MinMemgraph replica1(repl_conf);
|
||||||
|
|
||||||
replica1.repl_handler.SetReplicationRoleReplica(
|
replica1.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = ports[0],
|
.port = ports[0],
|
||||||
@ -1097,7 +1097,7 @@ TEST_F(ReplicationTest, RestoringReplicationAtStartup) {
|
|||||||
|
|
||||||
MinMemgraph replica2(repl2_conf);
|
MinMemgraph replica2(repl2_conf);
|
||||||
|
|
||||||
replica2.repl_handler.SetReplicationRoleReplica(
|
replica2.repl_handler.TrySetReplicationRoleReplica(
|
||||||
ReplicationServerConfig{
|
ReplicationServerConfig{
|
||||||
.ip_address = local_host,
|
.ip_address = local_host,
|
||||||
.port = ports[1],
|
.port = ports[1],
|
||||||
|
Loading…
Reference in New Issue
Block a user