Compare commits

...

35 Commits

Author SHA1 Message Date
Andi Skrgat
ccabbe7a63 Main restoration working 2024-01-29 14:15:26 +01:00
Andi Skrgat
aac169c5b3 Improve restoration process, WIP 2024-01-29 10:36:30 +01:00
Andi Skrgat
d0cb85e642 Support main restart 2024-01-26 12:12:34 +01:00
Andi Skrgat
34a7fed59a Tests for main restoration 2024-01-26 11:45:41 +01:00
Andi Skrgat
8124226ba9 Replica instance restored 2024-01-26 11:23:38 +01:00
antoniofilipovic
eea21dd73e start restorations 2024-01-26 09:34:43 +01:00
Andi Skrgat
a6483fc6a7 No role when instance down 2024-01-26 08:59:28 +01:00
Andi
45def388c7
Merge branch 'master' into automatic-failover 2024-01-25 21:37:01 +01:00
Andi Skrgat
da3db9f3bc Fix coordinator test 2024-01-25 16:02:17 +01:00
Andi Skrgat
34b2e360a9 fixup! Improve concurrency model 2024-01-25 15:33:33 +01:00
Andi Skrgat
9bc0d9425e Improve concurrency model 2024-01-25 15:20:25 +01:00
antoniofilipovic
62d1b68c2f Merge branch 'automatic-failover' of github.com:memgraph/memgraph into automatic-failover 2024-01-25 14:00:43 +01:00
antoniofilipovic
ec46cca7a3 fix issues 2024-01-25 14:00:29 +01:00
antoniofilipovic
326a04c6a3 fix failover to main 2024-01-25 12:08:01 +01:00
Andi Skrgat
b21f9e71ed fixup! fixup! Fix build 2024-01-25 11:12:34 +01:00
Andi Skrgat
c2e0b310f5 fixup! Fix build 2024-01-25 09:54:00 +01:00
Andi Skrgat
b492c7d34f Fix build 2024-01-25 09:38:11 +01:00
Andi Skrgat
303608982a Remove cluster_initialized and client_initiated failover tests 2024-01-25 09:38:11 +01:00
Andi Skrgat
c0f979fdcb Forbid spurious fail from new main during failover 2024-01-25 09:38:11 +01:00
antoniofilipovic
5242427686 merge with Andi's work, register instance works 2024-01-25 09:38:11 +01:00
antoniofilipovic
d3168ded5a merge with Andi's work - comment out problematic parts 2024-01-25 09:38:11 +01:00
antoniofilipovic
0a28fee34b add register instance and set instance to main queries and logic 2024-01-25 09:38:11 +01:00
Gareth Lloyd
dc9a2c45c4 Changes from Gareth to make build + comments 2024-01-25 09:38:11 +01:00
Andi Skrgat
1133bb8ecb Idempotent failover from coordinator 2024-01-25 09:38:11 +01:00
Andi Skrgat
e9c5cc3b82 Remove ShowMainReplica status 2024-01-25 09:38:11 +01:00
Andi Skrgat
3f4ac0dd58 Thread-safe AF working 2024-01-25 09:38:10 +01:00
Andi Skrgat
a0ecea7d1c Add generic CoordinatorInstance 2024-01-25 09:38:10 +01:00
Andi Skrgat
567e1fa1cb CoordinatorState -> CoordinatorData 2024-01-25 09:38:10 +01:00
Andi Skrgat
ef37c44149 Thread-unsafe automatic failover 2024-01-25 09:38:10 +01:00
Andi Skrgat
ab34b060c0 Thread-safe access to coordinator data 2024-01-25 09:38:10 +01:00
Andi Skrgat
67c1874e81 AF callbacks 2024-01-25 09:38:10 +01:00
Andi Skrgat
9d457eafa8 AF thread issue 2024-01-25 09:38:10 +01:00
Andi Skrgat
986ea37ead Refactoring of coord state 2024-01-25 09:38:10 +01:00
Andi Skrgat
afe7d47a5c Improve coordinator handler 2024-01-25 09:38:10 +01:00
Andi Skrgat
8884a0ea78 Rename replica_check_freq to instance_check_freq 2024-01-25 09:38:10 +01:00
115 changed files with 1545 additions and 1284 deletions

View File

@ -96,7 +96,7 @@ jobs:
- name: Python code analysis
run: |
CHANGED_FILES=$(git diff -U0 ${{ env.BASE_BRANCH }}... --name-only)
CHANGED_FILES=$(git diff -U0 ${{ env.BASE_BRANCH }}... --name-only --diff-filter=d)
for file in ${CHANGED_FILES}; do
echo ${file}
if [[ ${file} == *.py ]]; then

View File

@ -7,19 +7,24 @@ target_sources(mg-coordination
include/coordination/coordinator_rpc.hpp
include/coordination/coordinator_server.hpp
include/coordination/coordinator_config.hpp
include/coordination/coordinator_entity_info.hpp
include/coordination/coordinator_exceptions.hpp
include/coordination/coordinator_instance.hpp
include/coordination/coordinator_slk.hpp
include/coordination/coordinator_data.hpp
include/coordination/constants.hpp
include/coordination/failover_status.hpp
include/coordination/coordinator_cluster_config.hpp
PRIVATE
coordinator_client.cpp
coordinator_state.cpp
coordinator_rpc.cpp
coordinator_server.cpp
coordinator_data.cpp
coordinator_instance.cpp
)
target_include_directories(mg-coordination PUBLIC include)
target_link_libraries(mg-coordination
PUBLIC mg::utils mg::rpc mg::slk mg::io mg::repl_coord_glue
PUBLIC mg::utils mg::rpc mg::slk mg::io mg::repl_coord_glue lib::rangev3
)

View File

@ -27,84 +27,77 @@ auto CreateClientContext(const memgraph::coordination::CoordinatorClientConfig &
}
} // namespace
CoordinatorClient::CoordinatorClient(const CoordinatorClientConfig &config)
CoordinatorClient::CoordinatorClient(CoordinatorData *coord_data, CoordinatorClientConfig config,
HealthCheckCallback succ_cb, HealthCheckCallback fail_cb)
: rpc_context_{CreateClientContext(config)},
rpc_client_{io::network::Endpoint(io::network::Endpoint::needs_resolving, config.ip_address, config.port),
&rpc_context_},
config_{config} {}
config_{std::move(config)},
coord_data_{coord_data},
succ_cb_{std::move(succ_cb)},
fail_cb_{std::move(fail_cb)} {}
CoordinatorClient::~CoordinatorClient() {
auto exit_job = utils::OnScopeExit([&] {
StopFrequentCheck();
thread_pool_.Shutdown();
});
const auto endpoint = rpc_client_.Endpoint();
// Logging can throw
spdlog::trace("Closing replication client on {}:{}", endpoint.address, endpoint.port);
}
auto CoordinatorClient::InstanceName() const -> std::string { return config_.instance_name; }
auto CoordinatorClient::SocketAddress() const -> std::string { return rpc_client_.Endpoint().SocketAddress(); }
void CoordinatorClient::StartFrequentCheck() {
MG_ASSERT(config_.health_check_frequency_sec > std::chrono::seconds(0),
"Health check frequency must be greater than 0");
replica_checker_.Run(
"Coord checker", config_.health_check_frequency_sec,
[last_response_time = &last_response_time_, rpc_client = &rpc_client_] {
instance_checker_.Run(
config_.instance_name, config_.health_check_frequency_sec, [this, instance_name = config_.instance_name] {
try {
{
auto stream{rpc_client->Stream<memgraph::replication_coordination_glue::FrequentHeartbeatRpc>()};
stream.AwaitResponse();
last_response_time->store(std::chrono::system_clock::now(), std::memory_order_acq_rel);
}
spdlog::trace("Sending frequent heartbeat to machine {} on {}", instance_name,
rpc_client_.Endpoint().SocketAddress());
auto stream{rpc_client_.Stream<memgraph::replication_coordination_glue::FrequentHeartbeatRpc>()};
stream.AwaitResponse();
succ_cb_(coord_data_, instance_name);
} catch (const rpc::RpcFailedException &) {
// Nothing to do...wait for a reconnect
fail_cb_(coord_data_, instance_name);
}
});
}
void CoordinatorClient::StopFrequentCheck() { replica_checker_.Stop(); }
void CoordinatorClient::StopFrequentCheck() { instance_checker_.Stop(); }
void CoordinatorClient::PauseFrequentCheck() { instance_checker_.Pause(); }
void CoordinatorClient::ResumeFrequentCheck() { instance_checker_.Resume(); }
bool CoordinatorClient::DoHealthCheck() const {
auto current_time = std::chrono::system_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::seconds>(current_time -
last_response_time_.load(std::memory_order_acquire));
return duration.count() <= alive_response_time_difference_sec_;
auto CoordinatorClient::SetCallbacks(HealthCheckCallback succ_cb, HealthCheckCallback fail_cb) -> void {
succ_cb_ = std::move(succ_cb);
fail_cb_ = std::move(fail_cb);
}
auto CoordinatorClient::InstanceName() const -> std::string_view { return config_.instance_name; }
auto CoordinatorClient::Endpoint() const -> io::network::Endpoint const & { return rpc_client_.Endpoint(); }
auto CoordinatorClient::Config() const -> CoordinatorClientConfig const & { return config_; }
auto CoordinatorClient::ReplicationClientInfo() const -> CoordinatorClientConfig::ReplicationClientInfo const & {
MG_ASSERT(config_.replication_client_info.has_value(), "No ReplicationClientInfo for MAIN instance!");
return *config_.replication_client_info;
}
////// AF design choice
auto CoordinatorClient::ReplicationClientInfo() -> std::optional<CoordinatorClientConfig::ReplicationClientInfo> & {
MG_ASSERT(config_.replication_client_info.has_value(), "No ReplicationClientInfo for MAIN instance!");
auto CoordinatorClient::ReplicationClientInfo() const -> CoordinatorClientConfig::ReplicationClientInfo {
return config_.replication_client_info;
}
void CoordinatorClient::UpdateTimeCheck(const std::chrono::system_clock::time_point &last_checked_time) {
last_response_time_.store(last_checked_time, std::memory_order_acq_rel);
}
auto CoordinatorClient::GetLastTimeResponse() -> std::chrono::system_clock::time_point { return last_response_time_; }
auto CoordinatorClient::SendPromoteReplicaToMainRpc(
std::vector<CoordinatorClientConfig::ReplicationClientInfo> replication_clients_info) const -> bool {
try {
{
auto stream{rpc_client_.Stream<PromoteReplicaToMainRpc>(std::move(replication_clients_info))};
if (!stream.AwaitResponse().success) {
spdlog::error("Failed to perform failover!");
return false;
}
spdlog::info("Sent failover RPC from coordinator to new main!");
return true;
auto stream{rpc_client_.Stream<PromoteReplicaToMainRpc>(std::move(replication_clients_info))};
if (!stream.AwaitResponse().success) {
spdlog::error("Failed to receive successful RPC failover response!");
return false;
}
return true;
} catch (const rpc::RpcFailedException &) {
spdlog::error("Failed to send failover RPC from coordinator to new main!");
spdlog::error("RPC error occurred while sending failover RPC!");
}
return false;
}
// Sends SetMainToReplicaRpc, carrying this client's replication_client_info, to the instance.
//
// Returns true only when the RPC completes and the response reports success. A response with
// success == false and an rpc::RpcFailedException are both logged and reported as false.
auto CoordinatorClient::DemoteToReplica() const -> bool {
  // Only needed for logging; bind by reference instead of copying the string on every call.
  const auto &instance_name = config_.instance_name;
  try {
    auto stream{rpc_client_.Stream<SetMainToReplicaRpc>(config_.replication_client_info)};
    if (!stream.AwaitResponse().success) {
      spdlog::error("Failed to receive successful RPC response for setting instance {} to replica!", instance_name);
      return false;
    }
    spdlog::info("Sent request RPC from coordinator to instance to set it as replica!");
    return true;
  } catch (const rpc::RpcFailedException &) {
    spdlog::error("Failed to set instance {} to replica!", instance_name);
  }
  return false;
}

View File

@ -0,0 +1,224 @@
// Copyright 2024 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
// License, and you may not use this file except in compliance with the Business Source License.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
#include "coordination/coordinator_instance.hpp"
#include "coordination/register_main_replica_coordinator_status.hpp"
#ifdef MG_ENTERPRISE
#include "coordination/coordinator_data.hpp"
#include <range/v3/view.hpp>
#include <shared_mutex>
namespace memgraph::coordination {
// Builds the four health-check callbacks (replica/main x success/failure) that are handed to the
// per-instance clients.
//
// Each callback receives the owning CoordinatorData and the name of the pinged instance, takes
// coord_data_lock_, and looks the instance up by name (asserting that it is registered). All state is
// reached through the `coord_data` argument, so the callbacks do not capture `this`.
CoordinatorData::CoordinatorData() {
  auto find_instance = [](CoordinatorData *coord_data, std::string_view instance_name) -> CoordinatorInstance & {
    auto instance = std::ranges::find_if(
        coord_data->registered_instances_,
        [instance_name](const CoordinatorInstance &instance) { return instance.InstanceName() == instance_name; });
    MG_ASSERT(instance != coord_data->registered_instances_.end(), "Instance {} not found during callback!",
              instance_name);
    return *instance;
  };

  // Replica answered the heartbeat: refresh its last-response time.
  replica_succ_cb_ = [find_instance](CoordinatorData *coord_data, std::string_view instance_name) -> void {
    auto lock = std::lock_guard{coord_data->coord_data_lock_};
    spdlog::trace("Instance {} performing replica successful callback", instance_name);
    auto &instance = find_instance(coord_data, instance_name);
    instance.OnSuccessPing();
  };

  // Replica missed the heartbeat: let the instance re-evaluate whether it still counts as alive.
  replica_fail_cb_ = [find_instance](CoordinatorData *coord_data, std::string_view instance_name) -> void {
    auto lock = std::lock_guard{coord_data->coord_data_lock_};
    spdlog::trace("Instance {} performing replica failure callback", instance_name);
    auto &instance = find_instance(coord_data, instance_name);
    instance.OnFailPing();
  };

  // Main answered the heartbeat.
  main_succ_cb_ = [find_instance](CoordinatorData *coord_data, std::string_view instance_name) -> void {
    auto lock = std::lock_guard{coord_data->coord_data_lock_};
    spdlog::trace("Instance {} performing main successful callback", instance_name);
    auto &instance = find_instance(coord_data, instance_name);

    // A main that was never marked dead, or that comes back while no other main is alive, keeps its role.
    if (instance.IsAlive() || !coord_data->ClusterHasAliveMain()) {
      instance.OnSuccessPing();
      return;
    }

    // The instance was marked dead and another main is alive now, so it has to rejoin as a replica.
    // The demotion runs on the thread pool, after this callback has released the lock. The task owns a
    // copy of the name because the string_view argument is only guaranteed to be valid during this call.
    coord_data->thread_pool_.AddTask([&instance, coord_data, name = std::string{instance_name}]() {
      auto lock = std::lock_guard{coord_data->coord_data_lock_};
      spdlog::info("Demoting instance {} to replica", name);
      instance.PauseFrequentCheck();
      utils::OnScopeExit scope_exit{[&instance] { instance.ResumeFrequentCheck(); }};
      auto const status = instance.DemoteToReplica(coord_data->replica_succ_cb_, coord_data->replica_fail_cb_);
      if (!status) {
        spdlog::error("Instance {} failed to demote to replica", name);
      } else {
        spdlog::info("Instance {} demoted to replica", name);
        instance.OnSuccessPing();
      }
    });
  };

  // Main missed the heartbeat: update its liveness and, if the cluster is left without an alive main,
  // try to promote a replica.
  main_fail_cb_ = [find_instance](CoordinatorData *coord_data, std::string_view instance_name) -> void {
    auto lock = std::lock_guard{coord_data->coord_data_lock_};
    spdlog::trace("Instance {} performing main failure callback", instance_name);
    auto &instance = find_instance(coord_data, instance_name);
    instance.OnFailPing();

    if (!coord_data->ClusterHasAliveMain()) {
      spdlog::info("Cluster without main instance, starting automatic failover");
      switch (auto failover_status = coord_data->DoFailover(); failover_status) {
        using enum DoFailoverStatus;
        case ALL_REPLICAS_DOWN:
          spdlog::warn("Failover aborted since all replicas are down!");
          break;
        case MAIN_ALIVE:
          spdlog::warn("Failover aborted since main is alive!");
          break;
        case RPC_FAILED:
          spdlog::warn("Failover aborted since promoting replica to main failed!");
          break;
        case SUCCESS:
          break;
      }
    }
  };
}
// True when at least one registered instance is both in the MAIN role and currently marked alive.
// Does not take coord_data_lock_ itself.
auto CoordinatorData::ClusterHasAliveMain() const -> bool {
  return std::ranges::any_of(registered_instances_, [](const CoordinatorInstance &candidate) {
    return candidate.IsMain() && candidate.IsAlive();
  });
}
// Promotes the first alive replica to main.
//
// Returns ALL_REPLICAS_DOWN when no replica is alive, RPC_FAILED when the promotion fails, and SUCCESS
// otherwise. Health checks of the chosen instance are paused for the duration of the call and resumed
// on every exit path. Does not take coord_data_lock_ itself.
auto CoordinatorData::DoFailover() -> DoFailoverStatus {
  auto replicas = registered_instances_ | ranges::views::filter(&CoordinatorInstance::IsReplica);

  auto new_main = std::ranges::find_if(replicas, &CoordinatorInstance::IsAlive);
  if (new_main == replicas.end()) {
    return DoFailoverStatus::ALL_REPLICAS_DOWN;
  }

  new_main->PauseFrequentCheck();
  utils::OnScopeExit resume_checks{[&new_main] { new_main->ResumeFrequentCheck(); }};

  // The new main is told about every other replica, alive or not.
  std::vector<ReplClientInfo> other_replicas_info;
  other_replicas_info.reserve(std::ranges::distance(replicas));
  for (const CoordinatorInstance &replica : replicas) {
    if (replica != *new_main) {
      other_replicas_info.push_back(replica.ReplicationClientInfo());
    }
  }

  bool const promoted = new_main->PromoteToMain(std::move(other_replicas_info), main_succ_cb_, main_fail_cb_);
  return promoted ? DoFailoverStatus::SUCCESS : DoFailoverStatus::RPC_FAILED;
}
// Returns one status record (name, socket address, role string, liveness) per registered instance.
//
// The role is reported as "unknown" for an instance that is not alive, otherwise "main" or "replica".
auto CoordinatorData::ShowInstances() const -> std::vector<CoordinatorInstanceStatus> {
  auto const stringify_repl_role = [](const CoordinatorInstance &instance) -> std::string {
    if (!instance.IsAlive()) return "unknown";
    if (instance.IsMain()) return "main";
    return "replica";
  };

  auto const instance_to_status =
      [&stringify_repl_role](const CoordinatorInstance &instance) -> CoordinatorInstanceStatus {
    return {.instance_name = instance.InstanceName(),
            .socket_address = instance.SocketAddress(),
            .replication_role = stringify_repl_role(instance),
            .is_alive = instance.IsAlive()};
  };

  // Take the shared lock before touching registered_instances_ at all: size() must not run
  // concurrently with RegisterInstance's emplace_back.
  auto lock = std::shared_lock{coord_data_lock_};

  std::vector<CoordinatorInstanceStatus> instances_status;
  instances_status.reserve(registered_instances_.size());
  std::ranges::transform(registered_instances_, std::back_inserter(instances_status), instance_to_status);
  return instances_status;
}
// Promotes the registered instance called `instance_name` to main, under coord_data_lock_.
//
// Returns NO_INSTANCE_WITH_NAME when no such instance is registered, COULD_NOT_PROMOTE_TO_MAIN when
// the promotion fails, and SUCCESS otherwise. Health checks of the target are paused for the duration
// of the call and resumed on every exit path after the pause.
auto CoordinatorData::SetInstanceToMain(std::string instance_name) -> SetInstanceToMainCoordinatorStatus {
  auto lock = std::lock_guard{coord_data_lock_};

  auto const has_requested_name = [&instance_name](const CoordinatorInstance &instance) {
    return instance.InstanceName() == instance_name;
  };

  auto target = std::ranges::find_if(registered_instances_, has_requested_name);
  if (target == registered_instances_.end()) {
    spdlog::error("Instance {} not registered. Please register it using REGISTER INSTANCE {}", instance_name,
                  instance_name);
    return SetInstanceToMainCoordinatorStatus::NO_INSTANCE_WITH_NAME;
  }

  target->PauseFrequentCheck();
  utils::OnScopeExit resume_checks{[&target] { target->ResumeFrequentCheck(); }};

  // Every other registered instance is passed to the new main.
  std::vector<CoordinatorClientConfig::ReplicationClientInfo> others_info;
  others_info.reserve(registered_instances_.size() - 1);
  for (const CoordinatorInstance &instance : registered_instances_) {
    if (!has_requested_name(instance)) {
      others_info.push_back(instance.ReplicationClientInfo());
    }
  }

  if (!target->PromoteToMain(std::move(others_info), main_succ_cb_, main_fail_cb_)) {
    return SetInstanceToMainCoordinatorStatus::COULD_NOT_PROMOTE_TO_MAIN;
  }

  spdlog::info("Instance {} promoted to main", instance_name);
  return SetInstanceToMainCoordinatorStatus::SUCCESS;
}
// Registers a new instance described by `config`, under coord_data_lock_.
//
// Returns NAME_EXISTS when the instance name is already registered, END_POINT_EXISTS when the socket
// address is already registered (the name check takes precedence), RPC_FAILED when constructing the
// instance throws CoordinatorRegisterInstanceException, and SUCCESS otherwise. New instances are
// created with the replica callbacks.
auto CoordinatorData::RegisterInstance(CoordinatorClientConfig config) -> RegisterInstanceCoordinatorStatus {
  auto lock = std::lock_guard{coord_data_lock_};

  auto const same_name = [&config](const CoordinatorInstance &instance) {
    return instance.InstanceName() == config.instance_name;
  };
  auto const same_endpoint = [&config](const CoordinatorInstance &instance) {
    return instance.SocketAddress() == config.SocketAddress();
  };

  if (std::ranges::find_if(registered_instances_, same_name) != registered_instances_.end()) {
    return RegisterInstanceCoordinatorStatus::NAME_EXISTS;
  }
  if (std::ranges::find_if(registered_instances_, same_endpoint) != registered_instances_.end()) {
    return RegisterInstanceCoordinatorStatus::END_POINT_EXISTS;
  }

  try {
    registered_instances_.emplace_back(this, std::move(config), replica_succ_cb_, replica_fail_cb_);
  } catch (CoordinatorRegisterInstanceException const &) {
    return RegisterInstanceCoordinatorStatus::RPC_FAILED;
  }
  return RegisterInstanceCoordinatorStatus::SUCCESS;
}
} // namespace memgraph::coordination
#endif

View File

@ -0,0 +1,84 @@
// Copyright 2024 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
// License, and you may not use this file except in compliance with the Business Source License.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
#ifdef MG_ENTERPRISE
#include "coordination/coordinator_instance.hpp"
namespace memgraph::coordination {
// Creates the bookkeeping entry for one instance.
//
// The client is built from `data`, `config` and the two health-check callbacks; the instance starts
// in the REPLICA role and is marked alive. The constructor then asks the instance to demote itself to
// replica and throws CoordinatorRegisterInstanceException if that request fails. Periodic health
// checks are started only after the demotion succeeded.
CoordinatorInstance::CoordinatorInstance(CoordinatorData *data, CoordinatorClientConfig config,
HealthCheckCallback succ_cb, HealthCheckCallback fail_cb)
: client_(data, std::move(config), std::move(succ_cb), std::move(fail_cb)),
replication_role_(replication_coordination_glue::ReplicationRole::REPLICA),
is_alive_(true) {
if (!client_.DemoteToReplica()) {
throw CoordinatorRegisterInstanceException("Failed to demote instance {} to replica", client_.InstanceName());
}
// Start pinging only once the instance has accepted the replica role.
client_.StartFrequentCheck();
}
// Records a successful heartbeat: stamps the current time as the last response and marks the
// instance alive.
auto CoordinatorInstance::OnSuccessPing() -> void {
  auto const answered_at = std::chrono::system_clock::now();
  last_response_time_ = answered_at;
  is_alive_ = true;
}
// Records a failed heartbeat. The instance stays alive only while the whole seconds elapsed since
// the last successful response are strictly below
// CoordinatorClusterConfig::alive_response_time_difference_sec_. Returns the updated liveness.
auto CoordinatorInstance::OnFailPing() -> bool {
  auto const now = std::chrono::system_clock::now();
  auto const silent_for = std::chrono::duration_cast<std::chrono::seconds>(now - last_response_time_);
  is_alive_ = silent_for.count() < CoordinatorClusterConfig::alive_response_time_difference_sec_;
  return is_alive_;
}
// Name of the instance, as reported by the underlying client.
auto CoordinatorInstance::InstanceName() const -> std::string { return client_.InstanceName(); }
// Socket address of the instance, as reported by the underlying client.
auto CoordinatorInstance::SocketAddress() const -> std::string { return client_.SocketAddress(); }
// Liveness flag: set to true on construction and by OnSuccessPing, recomputed by OnFailPing.
auto CoordinatorInstance::IsAlive() const -> bool { return is_alive_; }
// True when the role tracked by the coordinator for this instance is REPLICA.
auto CoordinatorInstance::IsReplica() const -> bool {
return replication_role_ == replication_coordination_glue::ReplicationRole::REPLICA;
}
// True when the role tracked by the coordinator for this instance is MAIN.
auto CoordinatorInstance::IsMain() const -> bool {
return replication_role_ == replication_coordination_glue::ReplicationRole::MAIN;
}
// Asks the instance to become main, passing it the replication info of the other instances.
//
// When the promotion request succeeds, the tracked role becomes MAIN and the client's health-check
// callbacks are replaced with the supplied main callbacks. When it fails, nothing is changed.
// Returns whether the promotion request succeeded.
auto CoordinatorInstance::PromoteToMain(ReplicationClientsInfo repl_clients_info, HealthCheckCallback main_succ_cb,
                                        HealthCheckCallback main_fail_cb) -> bool {
  bool const promoted = client_.SendPromoteReplicaToMainRpc(std::move(repl_clients_info));
  if (promoted) {
    replication_role_ = replication_coordination_glue::ReplicationRole::MAIN;
    client_.SetCallbacks(std::move(main_succ_cb), std::move(main_fail_cb));
  }
  return promoted;
}
// Asks the instance to become a replica.
//
// When the demotion request succeeds, the tracked role becomes REPLICA and the client's health-check
// callbacks are replaced with the supplied replica callbacks. When it fails, nothing is changed.
// Returns whether the demotion request succeeded.
auto CoordinatorInstance::DemoteToReplica(HealthCheckCallback replica_succ_cb, HealthCheckCallback replica_fail_cb)
    -> bool {
  bool const demoted = client_.DemoteToReplica();
  if (demoted) {
    replication_role_ = replication_coordination_glue::ReplicationRole::REPLICA;
    client_.SetCallbacks(std::move(replica_succ_cb), std::move(replica_fail_cb));
  }
  return demoted;
}
// Forwards to the client's PauseFrequentCheck.
auto CoordinatorInstance::PauseFrequentCheck() -> void { client_.PauseFrequentCheck(); }
// Forwards to the client's ResumeFrequentCheck.
auto CoordinatorInstance::ResumeFrequentCheck() -> void { client_.ResumeFrequentCheck(); }
// Returns, by value, the replication client info held by the underlying client.
auto CoordinatorInstance::ReplicationClientInfo() const -> CoordinatorClientConfig::ReplicationClientInfo {
return client_.ReplicationClientInfo();
}
} // namespace memgraph::coordination
#endif

View File

@ -36,6 +36,22 @@ void PromoteReplicaToMainRes::Load(PromoteReplicaToMainRes *self, memgraph::slk:
memgraph::slk::Load(self, reader);
}
// Static serialization hooks of the SetMainToReplica request/response messages. Each one forwards
// to the memgraph::slk::Save / memgraph::slk::Load overload for the corresponding message type.

// Serializes a SetMainToReplicaReq into `builder`.
void SetMainToReplicaReq::Save(const SetMainToReplicaReq &self, memgraph::slk::Builder *builder) {
memgraph::slk::Save(self, builder);
}
// Deserializes a SetMainToReplicaReq from `reader` into `self`.
void SetMainToReplicaReq::Load(SetMainToReplicaReq *self, memgraph::slk::Reader *reader) {
memgraph::slk::Load(self, reader);
}
// Serializes a SetMainToReplicaRes into `builder`.
void SetMainToReplicaRes::Save(const SetMainToReplicaRes &self, memgraph::slk::Builder *builder) {
memgraph::slk::Save(self, builder);
}
// Deserializes a SetMainToReplicaRes from `reader` into `self`.
void SetMainToReplicaRes::Load(SetMainToReplicaRes *self, memgraph::slk::Reader *reader) {
memgraph::slk::Load(self, reader);
}
} // namespace coordination
constexpr utils::TypeInfo coordination::PromoteReplicaToMainReq::kType{utils::TypeId::COORD_FAILOVER_REQ,
@ -44,6 +60,12 @@ constexpr utils::TypeInfo coordination::PromoteReplicaToMainReq::kType{utils::Ty
constexpr utils::TypeInfo coordination::PromoteReplicaToMainRes::kType{utils::TypeId::COORD_FAILOVER_RES,
"CoordPromoteReplicaToMainRes", nullptr};
constexpr utils::TypeInfo coordination::SetMainToReplicaReq::kType{utils::TypeId::COORD_SET_REPL_MAIN_REQ,
"CoordDemoteToReplicaReq", nullptr};
constexpr utils::TypeInfo coordination::SetMainToReplicaRes::kType{utils::TypeId::COORD_SET_REPL_MAIN_RES,
"CoordDemoteToReplicaRes", nullptr};
namespace slk {
void Save(const memgraph::coordination::PromoteReplicaToMainRes &self, memgraph::slk::Builder *builder) {
@ -62,6 +84,22 @@ void Load(memgraph::coordination::PromoteReplicaToMainReq *self, memgraph::slk::
memgraph::slk::Load(&self->replication_clients_info, reader);
}
// SLK (de)serialization of the SetMainToReplica messages.

// The request carries a single field: replication_client_info.
void Save(const memgraph::coordination::SetMainToReplicaReq &self, memgraph::slk::Builder *builder) {
memgraph::slk::Save(self.replication_client_info, builder);
}
// Reads replication_client_info back into the request.
void Load(memgraph::coordination::SetMainToReplicaReq *self, memgraph::slk::Reader *reader) {
memgraph::slk::Load(&self->replication_client_info, reader);
}
// The response carries a single field: success.
void Save(const memgraph::coordination::SetMainToReplicaRes &self, memgraph::slk::Builder *builder) {
memgraph::slk::Save(self.success, builder);
}
// Reads success back into the response.
void Load(memgraph::coordination::SetMainToReplicaRes *self, memgraph::slk::Reader *reader) {
memgraph::slk::Load(&self->success, reader);
}
} // namespace slk
} // namespace memgraph

View File

@ -9,207 +9,75 @@
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
#include "coordination/coordinator_state.hpp"
#include <span>
#include "coordination/coordinator_client.hpp"
#ifdef MG_ENTERPRISE
#include "coordination/coordinator_state.hpp"
#include "coordination/coordinator_config.hpp"
#include "coordination/coordinator_entity_info.hpp"
#include "coordination/register_main_replica_coordinator_status.hpp"
#include "flags/replication.hpp"
#include "spdlog/spdlog.h"
#include "utils/logging.hpp"
#include "utils/variant_helpers.hpp"
#include <atomic>
#include <exception>
#include <optional>
#include <algorithm>
namespace memgraph::coordination {
namespace {
bool CheckName(const std::list<CoordinatorClient> &replicas, const CoordinatorClientConfig &config) {
auto name_matches = [&instance_name = config.instance_name](auto const &replica) {
return replica.InstanceName() == instance_name;
};
return std::any_of(replicas.begin(), replicas.end(), name_matches);
};
} // namespace
CoordinatorState::CoordinatorState() {
MG_ASSERT(!(FLAGS_coordinator && FLAGS_coordinator_server_port),
"Instance cannot be a coordinator and have registered coordinator server.");
spdlog::info("Executing coordinator constructor");
if (FLAGS_coordinator_server_port) {
spdlog::info("Coordinator server port set");
auto const config = CoordinatorServerConfig{
.ip_address = kDefaultReplicationServerIp,
.port = static_cast<uint16_t>(FLAGS_coordinator_server_port),
};
spdlog::info("Executing coordinator constructor main replica");
data_ = CoordinatorMainReplicaData{.coordinator_server_ = std::make_unique<CoordinatorServer>(config)};
}
}
auto CoordinatorState::RegisterReplica(const CoordinatorClientConfig &config)
-> utils::BasicResult<RegisterMainReplicaCoordinatorStatus, CoordinatorClient *> {
const auto name_endpoint_status =
std::visit(memgraph::utils::Overloaded{[](const CoordinatorMainReplicaData & /*coordinator_main_replica_data*/) {
return RegisterMainReplicaCoordinatorStatus::NOT_COORDINATOR;
},
[&config](const CoordinatorData &coordinator_data) {
if (memgraph::coordination::CheckName(
coordinator_data.registered_replicas_, config)) {
return RegisterMainReplicaCoordinatorStatus::NAME_EXISTS;
}
return RegisterMainReplicaCoordinatorStatus::SUCCESS;
}},
data_);
auto CoordinatorState::RegisterInstance(CoordinatorClientConfig config) -> RegisterInstanceCoordinatorStatus {
MG_ASSERT(std::holds_alternative<CoordinatorData>(data_),
"Coordinator cannot register replica since variant holds wrong alternative");
if (name_endpoint_status != RegisterMainReplicaCoordinatorStatus::SUCCESS) {
return name_endpoint_status;
}
// Maybe no need to return client if you can start replica client here
return &std::get<CoordinatorData>(data_).registered_replicas_.emplace_back(config);
}
auto CoordinatorState::RegisterMain(const CoordinatorClientConfig &config)
-> utils::BasicResult<RegisterMainReplicaCoordinatorStatus, CoordinatorClient *> {
const auto endpoint_status = std::visit(
return std::visit(
memgraph::utils::Overloaded{
[](const CoordinatorMainReplicaData & /*coordinator_main_replica_data*/) {
return RegisterMainReplicaCoordinatorStatus::NOT_COORDINATOR;
return RegisterInstanceCoordinatorStatus::NOT_COORDINATOR;
},
[](const CoordinatorData & /*coordinator_data*/) { return RegisterMainReplicaCoordinatorStatus::SUCCESS; }},
[config](CoordinatorData &coordinator_data) { return coordinator_data.RegisterInstance(config); }},
data_);
if (endpoint_status != RegisterMainReplicaCoordinatorStatus::SUCCESS) {
return endpoint_status;
}
auto &registered_main = std::get<CoordinatorData>(data_).registered_main_;
registered_main = std::make_unique<CoordinatorClient>(config);
return registered_main.get();
}
auto CoordinatorState::ShowReplicas() const -> std::vector<CoordinatorEntityInfo> {
auto CoordinatorState::SetInstanceToMain(std::string instance_name) -> SetInstanceToMainCoordinatorStatus {
MG_ASSERT(std::holds_alternative<CoordinatorData>(data_),
"Can't call show replicas on data_, as variant holds wrong alternative");
std::vector<CoordinatorEntityInfo> result;
const auto &registered_replicas = std::get<CoordinatorData>(data_).registered_replicas_;
result.reserve(registered_replicas.size());
std::ranges::transform(registered_replicas, std::back_inserter(result), [](const auto &replica) {
return CoordinatorEntityInfo{replica.InstanceName(), replica.Endpoint()};
});
return result;
"Coordinator cannot register replica since variant holds wrong alternative");
return std::visit(
memgraph::utils::Overloaded{[](const CoordinatorMainReplicaData & /*coordinator_main_replica_data*/) {
return SetInstanceToMainCoordinatorStatus::NOT_COORDINATOR;
},
[&instance_name](CoordinatorData &coordinator_data) {
return coordinator_data.SetInstanceToMain(instance_name);
}},
data_);
}
auto CoordinatorState::ShowMain() const -> std::optional<CoordinatorEntityInfo> {
auto CoordinatorState::ShowInstances() const -> std::vector<CoordinatorInstanceStatus> {
MG_ASSERT(std::holds_alternative<CoordinatorData>(data_),
"Can't call show main on data_, as variant holds wrong alternative");
const auto &registered_main = std::get<CoordinatorData>(data_).registered_main_;
if (registered_main) {
return CoordinatorEntityInfo{registered_main->InstanceName(), registered_main->Endpoint()};
}
return std::nullopt;
"Can't call show instances on data_, as variant holds wrong alternative");
return std::get<CoordinatorData>(data_).ShowInstances();
}
auto CoordinatorState::PingReplicas() const -> std::unordered_map<std::string_view, bool> {
MG_ASSERT(std::holds_alternative<CoordinatorData>(data_),
"Can't call ping replicas on data_, as variant holds wrong alternative");
std::unordered_map<std::string_view, bool> result;
const auto &registered_replicas = std::get<CoordinatorData>(data_).registered_replicas_;
result.reserve(registered_replicas.size());
for (const CoordinatorClient &replica_client : registered_replicas) {
result.emplace(replica_client.InstanceName(), replica_client.DoHealthCheck());
}
return result;
}
auto CoordinatorState::PingMain() const -> std::optional<CoordinatorEntityHealthInfo> {
MG_ASSERT(std::holds_alternative<CoordinatorData>(data_),
"Can't call show main on data_, as variant holds wrong alternative");
const auto &registered_main = std::get<CoordinatorData>(data_).registered_main_;
if (registered_main) {
return CoordinatorEntityHealthInfo{registered_main->InstanceName(), registered_main->DoHealthCheck()};
}
return std::nullopt;
}
auto CoordinatorState::DoFailover() -> DoFailoverStatus {
// 1. MAIN is already down, stop sending frequent checks
// 2. find new replica (coordinator)
// 3. make copy replica's client as potential new main client (coordinator)
// 4. send failover RPC to new main (coordinator and new main)
// 5. exchange old main to new main (coordinator)
// 6. remove replica which was promoted to main from all replicas -> this will shut down RPC frequent check client
// (coordinator)
// 7. for new main start frequent checks (coordinator)
// Tries to replace a MAIN that fails its health check with the first registered replica that passes one.
// Returns CLUSTER_UNINITIALIZED when no main is registered, MAIN_ALIVE when the main still answers the
// health check, ALL_REPLICAS_DOWN when no replica answers, and SUCCESS after the chosen replica has been
// promoted and swapped in as the registered main.
// NOTE(review): this body appears to contain two revisions of the function (the step-by-step version
// and the two-line delegation at the end) — confirm which one is current.
[[nodiscard]] auto CoordinatorState::DoFailover() -> DoFailoverStatus {
MG_ASSERT(std::holds_alternative<CoordinatorData>(data_), "Cannot do failover since variant holds wrong alternative");
using ReplicationClientInfo = CoordinatorClientConfig::ReplicationClientInfo;
// 1.
auto &current_main = std::get<CoordinatorData>(data_).registered_main_;
if (!current_main) {
return DoFailoverStatus::CLUSTER_UNINITIALIZED;
}
if (current_main->DoHealthCheck()) {
return DoFailoverStatus::MAIN_ALIVE;
}
current_main->StopFrequentCheck();
// 2.
// Get all replicas and find new main
auto &registered_replicas = std::get<CoordinatorData>(data_).registered_replicas_;
// The first replica that passes a health check is chosen; no other selection criterion is applied here.
const auto chosen_replica = std::ranges::find_if(
registered_replicas, [](const CoordinatorClient &replica) { return replica.DoHealthCheck(); });
if (chosen_replica == registered_replicas.end()) {
return DoFailoverStatus::ALL_REPLICAS_DOWN;
}
// Collect replication info of every replica except the chosen one (hence size() - 1); it is sent to the
// new main in step 4.
std::vector<ReplicationClientInfo> repl_clients_info;
repl_clients_info.reserve(registered_replicas.size() - 1);
std::ranges::for_each(registered_replicas, [&chosen_replica, &repl_clients_info](const CoordinatorClient &replica) {
if (replica != *chosen_replica) {
repl_clients_info.emplace_back(replica.ReplicationClientInfo());
}
});
// 3.
// Set on coordinator data of new main
// allocate resources for new main, clear replication info on this replica as main
// set last response time
auto potential_new_main = std::make_unique<CoordinatorClient>(chosen_replica->Config());
potential_new_main->ReplicationClientInfo().reset();
potential_new_main->UpdateTimeCheck(chosen_replica->GetLastTimeResponse());
// 4.
// A failed promotion RPC is not recoverable here: the MG_ASSERT(false, ...) below fires after logging.
if (!chosen_replica->SendPromoteReplicaToMainRpc(std::move(repl_clients_info))) {
spdlog::error("Sent RPC message, but exception was caught, aborting Failover");
// TODO: new status and rollback all changes that were done...
MG_ASSERT(false, "RPC message failed");
}
// 5.
current_main = std::move(potential_new_main);
// 6. remove old replica
// TODO: Stop pinging chosen_replica before failover.
// Check that it doesn't fail when you call StopFrequentCheck if it is already stopped
registered_replicas.erase(chosen_replica);
// 7.
current_main->StartFrequentCheck();
return DoFailoverStatus::SUCCESS;
// NOTE(review): the two statements below follow an unconditional return and are unreachable as written.
auto &coord_state = std::get<CoordinatorData>(data_);
return coord_state.DoFailover();
}
auto CoordinatorState::GetCoordinatorServer() const -> CoordinatorServer & {

View File

@ -16,53 +16,56 @@
#include "coordination/coordinator_config.hpp"
#include "rpc/client.hpp"
#include "utils/scheduler.hpp"
#include "utils/thread_pool.hpp"
#include <string_view>
namespace memgraph::coordination {
class CoordinatorData;
using HealthCheckCallback = std::function<void(CoordinatorData *, std::string_view)>;
using ReplicationClientsInfo = std::vector<ReplClientInfo>;
// Coordinator-side handle for a single registered instance. It holds the instance's configuration, an RPC
// context/client pair and schedulers, and exposes health checking plus the promote/demote RPC calls.
// Objects of this class are neither copyable nor movable.
// NOTE(review): several members are declared twice with differing signatures (destructor, copy operations,
// InstanceName, ReplicationClientInfo, SendPromoteReplicaToMainRpc, config_); this looks like two revisions
// of the class shown together — confirm which declarations are current.
class CoordinatorClient {
public:
explicit CoordinatorClient(const CoordinatorClientConfig &config);
explicit CoordinatorClient(CoordinatorData *coord_data_, CoordinatorClientConfig config, HealthCheckCallback succ_cb,
HealthCheckCallback fail_cb);
~CoordinatorClient();
~CoordinatorClient() = default;
CoordinatorClient(CoordinatorClient &other) = delete;
CoordinatorClient &operator=(CoordinatorClient const &other) = delete;
CoordinatorClient(CoordinatorClient &) = delete;
CoordinatorClient &operator=(CoordinatorClient const &) = delete;
CoordinatorClient(CoordinatorClient &&) noexcept = delete;
CoordinatorClient &operator=(CoordinatorClient &&) noexcept = delete;
// Control of the periodic check; presumably driven by the scheduler member(s) below — TODO confirm.
void StartFrequentCheck();
void StopFrequentCheck();
void PauseFrequentCheck();
void ResumeFrequentCheck();
auto DoHealthCheck() const -> bool;
auto SendPromoteReplicaToMainRpc(
std::vector<CoordinatorClientConfig::ReplicationClientInfo> replication_clients_info) const -> bool;
auto InstanceName() const -> std::string;
auto SocketAddress() const -> std::string;
auto InstanceName() const -> std::string_view;
auto Endpoint() const -> io::network::Endpoint const &;
auto Config() const -> CoordinatorClientConfig const &;
auto ReplicationClientInfo() const -> CoordinatorClientConfig::ReplicationClientInfo const &;
auto ReplicationClientInfo() -> std::optional<CoordinatorClientConfig::ReplicationClientInfo> &;
void UpdateTimeCheck(const std::chrono::system_clock::time_point &last_checked_time);
auto GetLastTimeResponse() -> std::chrono::system_clock::time_point;
// The bool results of the two RPC senders below must not be ignored ([[nodiscard]]).
[[nodiscard]] auto SendPromoteReplicaToMainRpc(ReplicationClientsInfo replication_clients_info) const -> bool;
[[nodiscard]] auto DemoteToReplica() const -> bool;
auto ReplicationClientInfo() const -> ReplClientInfo;
auto SetCallbacks(HealthCheckCallback succ_cb, HealthCheckCallback fail_cb) -> void;
// Two clients compare equal when their configurations compare equal; no other state is considered.
friend bool operator==(CoordinatorClient const &first, CoordinatorClient const &second) {
return first.config_ == second.config_;
}
private:
utils::ThreadPool thread_pool_{1};
utils::Scheduler replica_checker_;
utils::Scheduler instance_checker_;
// TODO: (andi) Pimpl?
communication::ClientContext rpc_context_;
// mutable so that const member functions can use the RPC client.
mutable rpc::Client rpc_client_;
CoordinatorClientConfig config_;
std::atomic<std::chrono::system_clock::time_point> last_response_time_{};
// NOTE(review): presumably the number of seconds since the last response within which the instance still
// counts as alive — confirm against the implementation.
static constexpr int alive_response_time_difference_sec_{5};
CoordinatorClientConfig config_;
// Non-owning pointer to the coordinator data supplied at construction.
CoordinatorData *coord_data_;
HealthCheckCallback succ_cb_;
HealthCheckCallback fail_cb_;
};
} // namespace memgraph::coordination

View File

@ -0,0 +1,22 @@
// Copyright 2024 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
// License, and you may not use this file except in compliance with the Business Source License.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
#pragma once
#ifdef MG_ENTERPRISE
namespace memgraph::coordination {
// Compile-time constants for the coordinator cluster.
struct CoordinatorClusterConfig {
  // NOTE(review): presumably the maximum age, in seconds, of the last response for an instance to still be
  // considered alive — confirm against the code that reads it.
  static constexpr int alive_response_time_difference_sec_ = 5;
};
} // namespace memgraph::coordination
#endif

View File

@ -25,16 +25,14 @@ namespace memgraph::coordination {
inline constexpr auto *kDefaultReplicationServerIp = "0.0.0.0";
struct CoordinatorClientConfig {
const std::string instance_name;
const std::string ip_address;
const uint16_t port{};
std::string instance_name;
std::string ip_address;
uint16_t port{};
std::chrono::seconds health_check_frequency_sec{1};
// Frequency with which the coordinator pings main/replicas about their status
const std::chrono::seconds health_check_frequency_sec{1};
auto SocketAddress() const -> std::string { return ip_address + ":" + std::to_string(port); }
// Info which coordinator will send to new main when performing failover
struct ReplicationClientInfo {
// Should be the same as CoordinatorClientConfig's instance_name
std::string instance_name;
replication_coordination_glue::ReplicationMode replication_mode{};
std::string replication_ip_address;
@ -43,20 +41,22 @@ struct CoordinatorClientConfig {
friend bool operator==(ReplicationClientInfo const &, ReplicationClientInfo const &) = default;
};
std::optional<ReplicationClientInfo> replication_client_info;
ReplicationClientInfo replication_client_info;
struct SSL {
const std::string key_file;
const std::string cert_file;
std::string key_file;
std::string cert_file;
friend bool operator==(const SSL &, const SSL &) = default;
};
const std::optional<SSL> ssl;
std::optional<SSL> ssl;
friend bool operator==(CoordinatorClientConfig const &, CoordinatorClientConfig const &) = default;
};
using ReplClientInfo = CoordinatorClientConfig::ReplicationClientInfo;
struct CoordinatorServerConfig {
std::string ip_address;
uint16_t port{};

View File

@ -0,0 +1,53 @@
// Copyright 2024 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
// License, and you may not use this file except in compliance with the Business Source License.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
#pragma once
#ifdef MG_ENTERPRISE
#include "coordination/coordinator_instance.hpp"
#include "coordination/coordinator_instance_status.hpp"
#include "coordination/coordinator_server.hpp"
#include "coordination/failover_status.hpp"
#include "coordination/register_main_replica_coordinator_status.hpp"
#include "utils/rw_lock.hpp"
#include "utils/thread_pool.hpp"
#include <list>
namespace memgraph::coordination {
// State kept by a coordinator: the list of registered instances together with the health-check callbacks
// and the operations (register, set main, failover, show) that act on that list.
class CoordinatorData {
 public:
  CoordinatorData();

  // Mutating operations; each reports its outcome through a status enum that must not be ignored.
  [[nodiscard]] auto RegisterInstance(CoordinatorClientConfig config) -> RegisterInstanceCoordinatorStatus;
  [[nodiscard]] auto SetInstanceToMain(std::string instance_name) -> SetInstanceToMainCoordinatorStatus;
  [[nodiscard]] auto DoFailover() -> DoFailoverStatus;

  // Read-only snapshot of all registered instances.
  auto ShowInstances() const -> std::vector<CoordinatorInstanceStatus>;

 private:
  auto ClusterHasAliveMain() const -> bool;

  // mutable so that const member functions can take the lock as well.
  mutable utils::RWLock coord_data_lock_{utils::RWLock::Priority::READ};

  // Callbacks for successful/failed health checks, kept separately for main and replica instances.
  HealthCheckCallback main_succ_cb_;
  HealthCheckCallback main_fail_cb_;
  HealthCheckCallback replica_succ_cb_;
  HealthCheckCallback replica_fail_cb_;

  // Must be std::list because we rely on pointer stability
  std::list<CoordinatorInstance> registered_instances_;

  utils::ThreadPool thread_pool_{1};
};
struct CoordinatorMainReplicaData {
std::unique_ptr<CoordinatorServer> coordinator_server_;
};
} // namespace memgraph::coordination
#endif

View File

@ -16,16 +16,16 @@
#include "utils/exceptions.hpp"
namespace memgraph::coordination {
class CoordinatorFailoverException final : public utils::BasicException {
class CoordinatorRegisterInstanceException final : public utils::BasicException {
public:
explicit CoordinatorFailoverException(const std::string_view what) noexcept
: BasicException("Failover didn't complete successfully: " + std::string(what)) {}
explicit CoordinatorRegisterInstanceException(const std::string_view what) noexcept
: BasicException("Failed to create instance: " + std::string(what)) {}
template <class... Args>
explicit CoordinatorFailoverException(fmt::format_string<Args...> fmt, Args &&...args) noexcept
: CoordinatorFailoverException(fmt::format(fmt, std::forward<Args>(args)...)) {}
explicit CoordinatorRegisterInstanceException(fmt::format_string<Args...> fmt, Args &&...args) noexcept
: CoordinatorRegisterInstanceException(fmt::format(fmt, std::forward<Args>(args)...)) {}
SPECIALIZE_GET_EXCEPTION_NAME(CoordinatorFailoverException)
SPECIALIZE_GET_EXCEPTION_NAME(CoordinatorRegisterInstanceException)
};
} // namespace memgraph::coordination

View File

@ -0,0 +1,68 @@
// Copyright 2024 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
// License, and you may not use this file except in compliance with the Business Source License.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
#pragma once
#ifdef MG_ENTERPRISE
#include "coordination/coordinator_client.hpp"
#include "coordination/coordinator_cluster_config.hpp"
#include "coordination/coordinator_exceptions.hpp"
#include "replication_coordination_glue/role.hpp"
namespace memgraph::coordination {
class CoordinatorData;
// One instance as tracked by the coordinator: its CoordinatorClient, its current replication role, the time
// of its last response and an aliveness flag. Instances are neither copyable nor movable.
class CoordinatorInstance {
 public:
  CoordinatorInstance(CoordinatorData *data, CoordinatorClientConfig config, HealthCheckCallback succ_cb,
                      HealthCheckCallback fail_cb);
  ~CoordinatorInstance() = default;

  CoordinatorInstance(CoordinatorInstance const &) = delete;
  CoordinatorInstance(CoordinatorInstance &&) noexcept = delete;
  CoordinatorInstance &operator=(CoordinatorInstance const &) = delete;
  CoordinatorInstance &operator=(CoordinatorInstance &&) noexcept = delete;

  // Health-check bookkeeping.
  // NOTE(review): the meaning of OnFailPing's bool result is not visible here — confirm in the definition.
  auto OnSuccessPing() -> void;
  auto OnFailPing() -> bool;
  auto IsAlive() const -> bool;

  // Identity and role queries.
  auto InstanceName() const -> std::string;
  auto SocketAddress() const -> std::string;
  auto IsReplica() const -> bool;
  auto IsMain() const -> bool;
  auto ReplicationClientInfo() const -> ReplClientInfo;

  // Role transitions; each takes the health-check callbacks to use for the new role and returns a success
  // flag.
  auto PromoteToMain(ReplicationClientsInfo repl_clients_info, HealthCheckCallback main_succ_cb,
                     HealthCheckCallback main_fail_cb) -> bool;
  auto DemoteToReplica(HealthCheckCallback replica_succ_cb, HealthCheckCallback replica_fail_cb) -> bool;

  auto PauseFrequentCheck() -> void;
  auto ResumeFrequentCheck() -> void;

  // Instances are equal when both their clients and their replication roles are equal.
  friend bool operator==(CoordinatorInstance const &lhs, CoordinatorInstance const &rhs) {
    if (!(lhs.client_ == rhs.client_)) {
      return false;
    }
    return lhs.replication_role_ == rhs.replication_role_;
  }

 private:
  CoordinatorClient client_;
  replication_coordination_glue::ReplicationRole replication_role_;
  std::chrono::system_clock::time_point last_response_time_{};
  bool is_alive_{false};
};
} // namespace memgraph::coordination
#endif

View File

@ -15,18 +15,15 @@
#include "io/network/endpoint.hpp"
#include <string>
#include <string_view>
namespace memgraph::coordination {
// Name and endpoint of a coordinator entity. Both members are non-owning (a string view and a reference),
// so the referenced name and endpoint must outlive this object.
struct CoordinatorEntityInfo {
  std::string_view name;
  io::network::Endpoint const &endpoint;
};
struct CoordinatorEntityHealthInfo {
std::string_view name;
bool alive;
// Owning, value-type description of one instance: its name, socket address, replication role (as text)
// and whether it is alive.
struct CoordinatorInstanceStatus {
  std::string instance_name;
  std::string socket_address;
  std::string replication_role;
  // Defaulted so that a default-initialized status never carries an indeterminate flag.
  bool is_alive{false};
};
} // namespace memgraph::coordination

View File

@ -48,6 +48,36 @@ struct PromoteReplicaToMainRes {
using PromoteReplicaToMainRpc = rpc::RequestResponse<PromoteReplicaToMainReq, PromoteReplicaToMainRes>;
struct SetMainToReplicaReq {
static const utils::TypeInfo kType;
static const utils::TypeInfo &GetTypeInfo() { return kType; }
static void Load(SetMainToReplicaReq *self, memgraph::slk::Reader *reader);
static void Save(const SetMainToReplicaReq &self, memgraph::slk::Builder *builder);
explicit SetMainToReplicaReq(CoordinatorClientConfig::ReplicationClientInfo replication_client_info)
: replication_client_info(std::move(replication_client_info)) {}
SetMainToReplicaReq() = default;
CoordinatorClientConfig::ReplicationClientInfo replication_client_info;
};
// Response half of SetMainToReplicaRpc: a single success flag.
struct SetMainToReplicaRes {
  static const utils::TypeInfo kType;
  static const utils::TypeInfo &GetTypeInfo() { return kType; }

  // SLK (de)serialization entry points.
  static void Load(SetMainToReplicaRes *self, memgraph::slk::Reader *reader);
  static void Save(const SetMainToReplicaRes &self, memgraph::slk::Builder *builder);

  explicit SetMainToReplicaRes(bool success) : success(success) {}
  SetMainToReplicaRes() = default;

  // Defaults to false so a default-constructed response never exposes an indeterminate value.
  bool success{false};
};
using SetMainToReplicaRpc = rpc::RequestResponse<SetMainToReplicaReq, SetMainToReplicaRes>;
} // namespace memgraph::coordination
// SLK serialization declarations
@ -61,6 +91,14 @@ void Save(const memgraph::coordination::PromoteReplicaToMainReq &self, memgraph:
void Load(memgraph::coordination::PromoteReplicaToMainReq *self, memgraph::slk::Reader *reader);
void Save(const memgraph::coordination::SetMainToReplicaRes &self, memgraph::slk::Builder *builder);
void Load(memgraph::coordination::SetMainToReplicaRes *self, memgraph::slk::Reader *reader);
void Save(const memgraph::coordination::SetMainToReplicaReq &self, memgraph::slk::Builder *builder);
void Load(memgraph::coordination::SetMainToReplicaReq *self, memgraph::slk::Reader *reader);
} // namespace memgraph::slk
#endif

View File

@ -13,29 +13,16 @@
#ifdef MG_ENTERPRISE
#include "coordination/coordinator_client.hpp"
#include "coordination/coordinator_entity_info.hpp"
#include "coordination/coordinator_data.hpp"
#include "coordination/coordinator_instance_status.hpp"
#include "coordination/coordinator_server.hpp"
#include "rpc/server.hpp"
#include "utils/result.hpp"
#include "utils/rw_spin_lock.hpp"
#include "utils/synchronized.hpp"
#include "coordination/failover_status.hpp"
#include "coordination/register_main_replica_coordinator_status.hpp"
#include <list>
#include <variant>
namespace memgraph::coordination {
// Outcome of registering a main or replica on the coordinator. Numeric values are spelled out and match
// the declaration order.
enum class RegisterMainReplicaCoordinatorStatus : uint8_t {
  NAME_EXISTS = 0,
  END_POINT_EXISTS = 1,
  COULD_NOT_BE_PERSISTED = 2,
  NOT_COORDINATOR = 3,
  SUCCESS = 4
};
// Outcome of a failover attempt.
enum class DoFailoverStatus : uint8_t {
  SUCCESS = 0,
  ALL_REPLICAS_DOWN = 1,     // no registered replica passed the health check
  MAIN_ALIVE = 2,            // the current main still passes the health check
  CLUSTER_UNINITIALIZED = 3  // no main is registered
};
class CoordinatorState {
public:
CoordinatorState();
@ -44,49 +31,21 @@ class CoordinatorState {
CoordinatorState(const CoordinatorState &) = delete;
CoordinatorState &operator=(const CoordinatorState &) = delete;
CoordinatorState(CoordinatorState &&other) noexcept : data_(std::move(other.data_)) {}
CoordinatorState(CoordinatorState &&) noexcept = delete;
CoordinatorState &operator=(CoordinatorState &&) noexcept = delete;
CoordinatorState &operator=(CoordinatorState &&other) noexcept {
if (this == &other) {
return *this;
}
data_ = std::move(other.data_);
return *this;
}
[[nodiscard]] auto RegisterInstance(CoordinatorClientConfig config) -> RegisterInstanceCoordinatorStatus;
auto RegisterReplica(const CoordinatorClientConfig &config)
-> utils::BasicResult<RegisterMainReplicaCoordinatorStatus, CoordinatorClient *>;
[[nodiscard]] auto SetInstanceToMain(std::string instance_name) -> SetInstanceToMainCoordinatorStatus;
auto RegisterMain(const CoordinatorClientConfig &config)
-> utils::BasicResult<RegisterMainReplicaCoordinatorStatus, CoordinatorClient *>;
auto ShowReplicas() const -> std::vector<CoordinatorEntityInfo>;
auto PingReplicas() const -> std::unordered_map<std::string_view, bool>;
auto ShowMain() const -> std::optional<CoordinatorEntityInfo>;
auto PingMain() const -> std::optional<CoordinatorEntityHealthInfo>;
auto ShowInstances() const -> std::vector<CoordinatorInstanceStatus>;
// The client code must check that the server exists before calling this method.
auto GetCoordinatorServer() const -> CoordinatorServer &;
auto DoFailover() -> DoFailoverStatus;
[[nodiscard]] auto DoFailover() -> DoFailoverStatus;
private:
// TODO: Data is not thread safe
// Coordinator stores registered replicas and main
struct CoordinatorData {
std::list<CoordinatorClient> registered_replicas_;
std::unique_ptr<CoordinatorClient> registered_main_;
};
// Data which each main and replica stores
struct CoordinatorMainReplicaData {
std::unique_ptr<CoordinatorServer> coordinator_server_;
};
std::variant<CoordinatorData, CoordinatorMainReplicaData> data_;
};

View File

@ -0,0 +1,21 @@
// Copyright 2024 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
// License, and you may not use this file except in compliance with the Business Source License.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
#pragma once
#ifdef MG_ENTERPRISE
#include <cstdint>
namespace memgraph::coordination {
// Outcome of a failover attempt.
// NOTE(review): the per-value meanings are inferred from the enumerator names; confirm against DoFailover.
enum class DoFailoverStatus : uint8_t {
  SUCCESS = 0,
  ALL_REPLICAS_DOWN = 1,
  MAIN_ALIVE = 2,
  RPC_FAILED = 3
};
} // namespace memgraph::coordination
#endif

View File

@ -0,0 +1,36 @@
// Copyright 2024 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
// License, and you may not use this file except in compliance with the Business Source License.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
#pragma once
#ifdef MG_ENTERPRISE
#include <cstdint>
namespace memgraph::coordination {
// Outcome of CoordinatorState::RegisterInstance / CoordinatorData::RegisterInstance. Numeric values are
// spelled out and match the declaration order.
enum class RegisterInstanceCoordinatorStatus : uint8_t {
  NAME_EXISTS = 0,
  END_POINT_EXISTS = 1,
  NOT_COORDINATOR = 2,
  RPC_FAILED = 3,
  SUCCESS = 4
};
// Outcome of CoordinatorState::SetInstanceToMain / CoordinatorData::SetInstanceToMain. Numeric values are
// spelled out and match the declaration order (note that SUCCESS is not the last enumerator).
enum class SetInstanceToMainCoordinatorStatus : uint8_t {
  NO_INSTANCE_WITH_NAME = 0,
  NOT_COORDINATOR = 1,
  SUCCESS = 2,
  COULD_NOT_PROMOTE_TO_MAIN = 3,
};
} // namespace memgraph::coordination
#endif

View File

@ -9,6 +9,7 @@
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
#include "coordination/register_main_replica_coordinator_status.hpp"
#ifdef MG_ENTERPRISE
#include "dbms/coordinator_handler.hpp"
@ -19,81 +20,19 @@ namespace memgraph::dbms {
CoordinatorHandler::CoordinatorHandler(DbmsHandler &dbms_handler) : dbms_handler_(dbms_handler) {}
auto CoordinatorHandler::RegisterReplicaOnCoordinator(const memgraph::coordination::CoordinatorClientConfig &config)
-> utils::BasicResult<RegisterMainReplicaCoordinatorStatus> {
auto instance_client = dbms_handler_.CoordinatorState().RegisterReplica(config);
using repl_status = memgraph::coordination::RegisterMainReplicaCoordinatorStatus;
using dbms_status = memgraph::dbms::RegisterMainReplicaCoordinatorStatus;
if (instance_client.HasError()) {
switch (instance_client.GetError()) {
case memgraph::coordination::RegisterMainReplicaCoordinatorStatus::NOT_COORDINATOR:
MG_ASSERT(false, "Only coordinator instance can register main and replica!");
return {};
case repl_status::NAME_EXISTS:
return dbms_status::NAME_EXISTS;
case repl_status::END_POINT_EXISTS:
return dbms_status::END_POINT_EXISTS;
case repl_status::COULD_NOT_BE_PERSISTED:
return dbms_status::COULD_NOT_BE_PERSISTED;
case repl_status::SUCCESS:
break;
}
}
instance_client.GetValue()->StartFrequentCheck();
return {};
// Forwards instance registration to the coordinator state and returns its status unchanged.
// `config` is taken by value and moved on, matching how SetInstanceToMain forwards its argument, so the
// configuration is not copied a second time.
auto CoordinatorHandler::RegisterInstance(memgraph::coordination::CoordinatorClientConfig config)
    -> coordination::RegisterInstanceCoordinatorStatus {
  return dbms_handler_.CoordinatorState().RegisterInstance(std::move(config));
}
auto CoordinatorHandler::RegisterMainOnCoordinator(const memgraph::coordination::CoordinatorClientConfig &config)
-> utils::BasicResult<RegisterMainReplicaCoordinatorStatus> {
auto instance_client = dbms_handler_.CoordinatorState().RegisterMain(config);
if (instance_client.HasError()) switch (instance_client.GetError()) {
case memgraph::coordination::RegisterMainReplicaCoordinatorStatus::NOT_COORDINATOR:
MG_ASSERT(false, "Only coordinator instance can register main and replica!");
case memgraph::coordination::RegisterMainReplicaCoordinatorStatus::NAME_EXISTS:
return memgraph::dbms::RegisterMainReplicaCoordinatorStatus::NAME_EXISTS;
case memgraph::coordination::RegisterMainReplicaCoordinatorStatus::END_POINT_EXISTS:
return memgraph::dbms::RegisterMainReplicaCoordinatorStatus::END_POINT_EXISTS;
case memgraph::coordination::RegisterMainReplicaCoordinatorStatus::COULD_NOT_BE_PERSISTED:
return memgraph::dbms::RegisterMainReplicaCoordinatorStatus::COULD_NOT_BE_PERSISTED;
case memgraph::coordination::RegisterMainReplicaCoordinatorStatus::SUCCESS:
break;
}
instance_client.GetValue()->StartFrequentCheck();
return {};
// Forwards the request to make `instance_name` the main to the coordinator state and returns its status
// unchanged.
auto CoordinatorHandler::SetInstanceToMain(std::string instance_name)
    -> coordination::SetInstanceToMainCoordinatorStatus {
  auto &&coordinator_state = dbms_handler_.CoordinatorState();
  return coordinator_state.SetInstanceToMain(std::move(instance_name));
}
auto CoordinatorHandler::ShowReplicasOnCoordinator() const -> std::vector<coordination::CoordinatorEntityInfo> {
return dbms_handler_.CoordinatorState().ShowReplicas();
// Returns the status of every instance as reported by the coordinator state.
auto CoordinatorHandler::ShowInstances() const -> std::vector<coordination::CoordinatorInstanceStatus> {
  auto &&coordinator_state = dbms_handler_.CoordinatorState();
  return coordinator_state.ShowInstances();
}
// Returns the coordinator state's replica health map (instance name -> health-check result).
auto CoordinatorHandler::PingReplicasOnCoordinator() const -> std::unordered_map<std::string_view, bool> {
  auto &&coordinator_state = dbms_handler_.CoordinatorState();
  return coordinator_state.PingReplicas();
}
// Returns whatever the coordinator state's ShowMain() reports (an optional entity info).
auto CoordinatorHandler::ShowMainOnCoordinator() const -> std::optional<coordination::CoordinatorEntityInfo> {
  auto &&coordinator_state = dbms_handler_.CoordinatorState();
  return coordinator_state.ShowMain();
}
// Returns the health info of the registered main as reported by the coordinator state, or std::nullopt
// when the state reports none.
auto CoordinatorHandler::PingMainOnCoordinator() const -> std::optional<coordination::CoordinatorEntityHealthInfo> {
  auto &&coordinator_state = dbms_handler_.CoordinatorState();
  return coordinator_state.PingMain();
}
// Runs a failover on the coordinator state and translates the coordination-layer status into the
// equivalent dbms-layer status, enumerator by enumerator.
auto CoordinatorHandler::DoFailover() const -> DoFailoverStatus {
  using CoordStatus = memgraph::coordination::DoFailoverStatus;
  using DbmsStatus = memgraph::dbms::DoFailoverStatus;

  const auto status = dbms_handler_.CoordinatorState().DoFailover();
  switch (status) {
    case CoordStatus::ALL_REPLICAS_DOWN:
      return DbmsStatus::ALL_REPLICAS_DOWN;
    case CoordStatus::SUCCESS:
      return DbmsStatus::SUCCESS;
    case CoordStatus::MAIN_ALIVE:
      return DbmsStatus::MAIN_ALIVE;
    case CoordStatus::CLUSTER_UNINITIALIZED:
      return DbmsStatus::CLUSTER_UNINITIALIZED;
  }
  // Only reachable if `status` holds a value outside the enumerators above. Previously control would
  // flow off the end of a non-void function here (undefined behaviour); fail loudly instead.
  MG_ASSERT(false, "Unhandled DoFailoverStatus value returned by coordinator state!");
  // Never a meaningful result; present so every path of the function returns a value.
  return DbmsStatus::CLUSTER_UNINITIALIZED;
}
} // namespace memgraph::dbms
#endif

View File

@ -15,49 +15,29 @@
#include "utils/result.hpp"
#include "coordination/coordinator_config.hpp"
#include "coordination/coordinator_instance_status.hpp"
#include "coordination/failover_status.hpp"
#include "coordination/register_main_replica_coordinator_status.hpp"
#include <cstdint>
#include <optional>
#include <vector>
namespace memgraph::coordination {
struct CoordinatorEntityInfo;
struct CoordinatorEntityHealthInfo;
struct CoordinatorClientConfig;
} // namespace memgraph::coordination
namespace memgraph::dbms {
// dbms-layer outcome of registering a main or replica on the coordinator. Numeric values are spelled out
// and match the declaration order.
enum class RegisterMainReplicaCoordinatorStatus : uint8_t {
  NAME_EXISTS = 0,
  END_POINT_EXISTS = 1,
  COULD_NOT_BE_PERSISTED = 2,
  NOT_COORDINATOR = 3,
  SUCCESS = 4
};
// dbms-layer outcome of a failover attempt; CoordinatorHandler::DoFailover maps each coordination-layer
// status onto the enumerator of the same name.
enum class DoFailoverStatus : uint8_t {
  SUCCESS = 0,
  ALL_REPLICAS_DOWN = 1,
  MAIN_ALIVE = 2,
  CLUSTER_UNINITIALIZED = 3
};
class DbmsHandler;
class CoordinatorHandler {
public:
explicit CoordinatorHandler(DbmsHandler &dbms_handler);
auto RegisterReplicaOnCoordinator(const memgraph::coordination::CoordinatorClientConfig &config)
-> utils::BasicResult<RegisterMainReplicaCoordinatorStatus>;
auto RegisterInstance(coordination::CoordinatorClientConfig config)
-> coordination::RegisterInstanceCoordinatorStatus;
auto RegisterMainOnCoordinator(const memgraph::coordination::CoordinatorClientConfig &config)
-> utils::BasicResult<RegisterMainReplicaCoordinatorStatus>;
auto SetInstanceToMain(std::string instance_name) -> coordination::SetInstanceToMainCoordinatorStatus;
auto ShowReplicasOnCoordinator() const -> std::vector<memgraph::coordination::CoordinatorEntityInfo>;
auto ShowMainOnCoordinator() const -> std::optional<memgraph::coordination::CoordinatorEntityInfo>;
auto PingReplicasOnCoordinator() const -> std::unordered_map<std::string_view, bool>;
auto PingMainOnCoordinator() const -> std::optional<memgraph::coordination::CoordinatorEntityHealthInfo>;
auto DoFailover() const -> DoFailoverStatus;
auto ShowInstances() const -> std::vector<coordination::CoordinatorInstanceStatus>;
private:
DbmsHandler &dbms_handler_;

View File

@ -19,6 +19,8 @@
#include "dbms/dbms_handler.hpp"
#include "dbms/replication_client.hpp"
#include "range/v3/view.hpp"
namespace memgraph::dbms {
void CoordinatorHandlers::Register(DbmsHandler &dbms_handler) {
@ -26,9 +28,42 @@ void CoordinatorHandlers::Register(DbmsHandler &dbms_handler) {
server.Register<coordination::PromoteReplicaToMainRpc>(
[&dbms_handler](slk::Reader *req_reader, slk::Builder *res_builder) -> void {
spdlog::info("Received PromoteReplicaToMainRpc from coordinator server");
spdlog::info("Received PromoteReplicaToMainRpc");
CoordinatorHandlers::PromoteReplicaToMainHandler(dbms_handler, req_reader, res_builder);
});
server.Register<coordination::SetMainToReplicaRpc>(
[&dbms_handler](slk::Reader *req_reader, slk::Builder *res_builder) -> void {
spdlog::info("Received SetMainToReplicaRpc from coordinator server");
CoordinatorHandlers::SetMainToReplicaHandler(dbms_handler, req_reader, res_builder);
});
}
// Handles SetMainToReplicaRpc: demotes this instance from main to replica.
//
// Replies with SetMainToReplicaRes{false} when the instance is already a replica or when switching the
// replication role fails, and with SetMainToReplicaRes{true} on success. Every reply uses the response
// type of this RPC (two of them were previously serialized as PromoteReplicaToMainRes).
//
// @param dbms_handler  handler whose replication state is inspected and changed
// @param req_reader    SLK reader positioned at the serialized SetMainToReplicaReq
// @param res_builder   SLK builder the SetMainToReplicaRes is written to
void CoordinatorHandlers::SetMainToReplicaHandler(DbmsHandler &dbms_handler, slk::Reader *req_reader,
                                                  slk::Builder *res_builder) {
  auto &repl_state = dbms_handler.ReplicationState();
  spdlog::info("Executing SetMainToReplicaHandler");

  if (repl_state.IsReplica()) {
    spdlog::error("Setting to replica must be performed on main.");
    slk::Save(coordination::SetMainToReplicaRes{false}, res_builder);
    return;
  }

  coordination::SetMainToReplicaReq req;
  slk::Load(&req, req_reader);

  // The replication server this instance runs as a replica uses the address and port from the request.
  const replication::ReplicationServerConfig clients_config{
      .ip_address = req.replication_client_info.replication_ip_address,
      .port = req.replication_client_info.replication_port};

  if (bool success = memgraph::dbms::SetReplicationRoleReplica(dbms_handler, clients_config); !success) {
    spdlog::error("Setting main to replica failed!");
    slk::Save(coordination::SetMainToReplicaRes{false}, res_builder);
    return;
  }

  slk::Save(coordination::SetMainToReplicaRes{true}, res_builder);
}
void CoordinatorHandlers::PromoteReplicaToMainHandler(DbmsHandler &dbms_handler, slk::Reader *req_reader,
@ -36,11 +71,15 @@ void CoordinatorHandlers::PromoteReplicaToMainHandler(DbmsHandler &dbms_handler,
auto &repl_state = dbms_handler.ReplicationState();
if (!repl_state.IsReplica()) {
spdlog::error("Failover must be performed on replica!");
spdlog::error("Only replica can be promoted to main!");
slk::Save(coordination::PromoteReplicaToMainRes{false}, res_builder);
return;
}
auto repl_server_config = std::get<replication::RoleReplicaData>(repl_state.ReplicationData()).config;
// This can fail because of disk. If it does, the cluster state could get inconsistent.
// We don't handle disk issues.
if (bool success = memgraph::dbms::DoReplicaToMainPromotion(dbms_handler); !success) {
spdlog::error("Promoting replica to main failed!");
slk::Save(coordination::PromoteReplicaToMainRes{false}, res_builder);
@ -50,34 +89,40 @@ void CoordinatorHandlers::PromoteReplicaToMainHandler(DbmsHandler &dbms_handler,
coordination::PromoteReplicaToMainReq req;
slk::Load(&req, req_reader);
std::vector<replication::ReplicationClientConfig> clients_config;
clients_config.reserve(req.replication_clients_info.size());
std::ranges::transform(req.replication_clients_info, std::back_inserter(clients_config),
[](const auto &repl_info_config) {
return replication::ReplicationClientConfig{
.name = repl_info_config.instance_name,
.mode = repl_info_config.replication_mode,
.ip_address = repl_info_config.replication_ip_address,
.port = repl_info_config.replication_port,
};
});
auto const converter = [](const auto &repl_info_config) {
return replication::ReplicationClientConfig{
.name = repl_info_config.instance_name,
.mode = repl_info_config.replication_mode,
.ip_address = repl_info_config.replication_ip_address,
.port = repl_info_config.replication_port,
};
};
std::ranges::for_each(clients_config, [&dbms_handler, &repl_state, &res_builder](const auto &config) {
MG_ASSERT(
std::get<replication::RoleMainData>(repl_state.ReplicationData()).registered_replicas_.empty(),
"No replicas should be registered after promoting replica to main and before registering replication clients!");
// registering replicas
for (auto const &config : req.replication_clients_info | ranges::views::transform(converter)) {
auto instance_client = repl_state.RegisterReplica(config);
if (instance_client.HasError()) {
switch (instance_client.GetError()) {
// Can't happen, we are already replica
case memgraph::replication::RegisterReplicaError::NOT_MAIN:
spdlog::error("Failover must be performed to main!");
slk::Save(coordination::PromoteReplicaToMainRes{false}, res_builder);
return;
// Can't happen, checked on the coordinator side
case memgraph::replication::RegisterReplicaError::NAME_EXISTS:
spdlog::error("Replica with the same name already exists!");
slk::Save(coordination::PromoteReplicaToMainRes{false}, res_builder);
return;
case memgraph::replication::RegisterReplicaError::END_POINT_EXISTS:
// Can't happen, checked on the coordinator side
case memgraph::replication::RegisterReplicaError::ENDPOINT_EXISTS:
spdlog::error("Replica with the same endpoint already exists!");
slk::Save(coordination::PromoteReplicaToMainRes{false}, res_builder);
return;
// We don't handle disk issues
case memgraph::replication::RegisterReplicaError::COULD_NOT_BE_PERSISTED:
spdlog::error("Registered replica could not be persisted!");
slk::Save(coordination::PromoteReplicaToMainRes{false}, res_builder);
@ -86,18 +131,22 @@ void CoordinatorHandlers::PromoteReplicaToMainHandler(DbmsHandler &dbms_handler,
break;
}
}
auto &instance_client_ref = *instance_client.GetValue();
const bool all_clients_good = memgraph::dbms::RegisterAllDatabasesClients(dbms_handler, instance_client_ref);
if (!all_clients_good) {
spdlog::error("Failed to register all databases to the REPLICA \"{}\"", config.name);
slk::Save(coordination::PromoteReplicaToMainRes{false}, res_builder);
return;
if (!allow_mt_repl && dbms_handler.All().size() > 1) {
spdlog::warn("Multi-tenant replication is currently not supported!");
}
auto &instance_client_ref = *instance_client.GetValue();
// Update system before enabling individual storage <-> replica clients
dbms_handler.SystemRestore(instance_client_ref);
// TODO: (andi) Policy for register all databases
// Will be resolved after deciding about choosing new replica
const bool all_clients_good = memgraph::dbms::RegisterAllDatabasesClients(dbms_handler, instance_client_ref);
MG_ASSERT(all_clients_good, "Failed to register one or more databases to the REPLICA \"{}\".", config.name);
StartReplicaClient(dbms_handler, instance_client_ref);
});
}
slk::Save(coordination::PromoteReplicaToMainRes{true}, res_builder);
}

View File

@ -26,6 +26,7 @@ class CoordinatorHandlers {
private:
static void PromoteReplicaToMainHandler(DbmsHandler &dbms_handler, slk::Reader *req_reader,
slk::Builder *res_builder);
static void SetMainToReplicaHandler(DbmsHandler &dbms_handler, slk::Reader *req_reader, slk::Builder *res_builder);
};
} // namespace memgraph::dbms

View File

@ -110,7 +110,7 @@ class Database {
* @param force_directory Use the configured directory, do not try to decipher the multi-db version
* @return DatabaseInfo
*/
DatabaseInfo GetInfo(bool force_directory, replication::ReplicationRole replication_role) const {
DatabaseInfo GetInfo(bool force_directory, replication_coordination_glue::ReplicationRole replication_role) const {
DatabaseInfo info;
info.storage_info = storage_->GetInfo(force_directory, replication_role);
info.triggers = trigger_store_.GetTriggerInfo().size();

View File

@ -23,7 +23,7 @@
#include "storage/v2/inmemory/storage.hpp"
#include "storage/v2/inmemory/unique_constraints.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
using memgraph::storage::Delta;
using memgraph::storage::EdgeAccessor;
using memgraph::storage::EdgeRef;

View File

@ -38,8 +38,8 @@ std::string RegisterReplicaErrorToString(RegisterReplicaError error) {
using enum RegisterReplicaError;
case NAME_EXISTS:
return "NAME_EXISTS";
case END_POINT_EXISTS:
return "END_POINT_EXISTS";
case ENDPOINT_EXISTS:
return "ENDPOINT_EXISTS";
case CONNECTION_FAILED:
return "CONNECTION_FAILED";
case COULD_NOT_BE_PERSISTED:
@ -100,16 +100,16 @@ auto ReplicationHandler::RegisterReplica(const memgraph::replication::Replicatio
-> memgraph::utils::BasicResult<RegisterReplicaError> {
MG_ASSERT(dbms_handler_.ReplicationState().IsMain(), "Only main instance can register a replica!");
auto instance_client = dbms_handler_.ReplicationState().RegisterReplica(config);
if (instance_client.HasError()) {
switch (instance_client.GetError()) {
auto maybe_client = dbms_handler_.ReplicationState().RegisterReplica(config);
if (maybe_client.HasError()) {
switch (maybe_client.GetError()) {
case memgraph::replication::RegisterReplicaError::NOT_MAIN:
MG_ASSERT(false, "Only main instance can register a replica!");
return {};
case memgraph::replication::RegisterReplicaError::NAME_EXISTS:
return memgraph::dbms::RegisterReplicaError::NAME_EXISTS;
case memgraph::replication::RegisterReplicaError::END_POINT_EXISTS:
return memgraph::dbms::RegisterReplicaError::END_POINT_EXISTS;
case memgraph::replication::RegisterReplicaError::ENDPOINT_EXISTS:
return memgraph::dbms::RegisterReplicaError::ENDPOINT_EXISTS;
case memgraph::replication::RegisterReplicaError::COULD_NOT_BE_PERSISTED:
return memgraph::dbms::RegisterReplicaError::COULD_NOT_BE_PERSISTED;
case memgraph::replication::RegisterReplicaError::SUCCESS:
@ -123,14 +123,14 @@ auto ReplicationHandler::RegisterReplica(const memgraph::replication::Replicatio
#ifdef MG_ENTERPRISE
// Update system before enabling individual storage <-> replica clients
dbms_handler_.SystemRestore(*instance_client.GetValue());
dbms_handler_.SystemRestore(*maybe_client.GetValue());
#endif
const auto dbms_error = memgraph::dbms::HandleErrorOnReplicaClient(instance_client);
const auto dbms_error = memgraph::dbms::HandleRegisterReplicaStatus(maybe_client);
if (dbms_error.has_value()) {
return *dbms_error;
}
auto &instance_client_ptr = instance_client.GetValue();
auto &instance_client_ptr = maybe_client.GetValue();
const bool all_clients_good = memgraph::dbms::RegisterAllDatabasesClients(dbms_handler_, *instance_client_ptr);
// NOTE Currently if any databases fails, we revert back
@ -141,7 +141,7 @@ auto ReplicationHandler::RegisterReplica(const memgraph::replication::Replicatio
}
// No client error, start instance level client
StartReplicaClient(dbms_handler_, *instance_client.GetValue());
StartReplicaClient(dbms_handler_, *instance_client_ptr);
return {};
}
@ -169,7 +169,7 @@ auto ReplicationHandler::UnregisterReplica(std::string_view name) -> UnregisterR
dbms_handler_.ReplicationState().ReplicationData());
}
auto ReplicationHandler::GetRole() const -> memgraph::replication::ReplicationRole {
auto ReplicationHandler::GetRole() const -> memgraph::replication_coordination_glue::ReplicationRole {
return dbms_handler_.ReplicationState().GetRole();
}

View File

@ -11,8 +11,8 @@
#pragma once
#include "replication_coordination_glue/role.hpp"
#include "dbms/database.hpp"
#include "replication/role.hpp"
#include "utils/result.hpp"
namespace memgraph::replication {
@ -25,7 +25,7 @@ namespace memgraph::dbms {
class DbmsHandler;
enum class RegisterReplicaError : uint8_t { NAME_EXISTS, END_POINT_EXISTS, CONNECTION_FAILED, COULD_NOT_BE_PERSISTED };
enum class RegisterReplicaError : uint8_t { NAME_EXISTS, ENDPOINT_EXISTS, CONNECTION_FAILED, COULD_NOT_BE_PERSISTED };
enum class UnregisterReplicaResult : uint8_t {
NOT_MAIN,
@ -53,7 +53,7 @@ struct ReplicationHandler {
auto UnregisterReplica(std::string_view name) -> UnregisterReplicaResult;
// Helper pass-through (TODO: remove)
auto GetRole() const -> memgraph::replication::ReplicationRole;
auto GetRole() const -> memgraph::replication_coordination_glue::ReplicationRole;
bool IsMain() const;
bool IsReplica() const;

View File

@ -18,6 +18,7 @@
namespace memgraph::dbms {
inline bool DoReplicaToMainPromotion(dbms::DbmsHandler &dbms_handler) {
auto &repl_state = dbms_handler.ReplicationState();
// STEP 1) bring down all REPLICA servers
dbms_handler.ForEach([](DatabaseAccess db_acc) {
auto *storage = db_acc->storage();
@ -27,7 +28,7 @@ inline bool DoReplicaToMainPromotion(dbms::DbmsHandler &dbms_handler) {
// STEP 2) Change to MAIN
// TODO: restore replication servers if false?
if (!dbms_handler.ReplicationState().SetReplicationRoleMain()) {
if (!repl_state.SetReplicationRoleMain()) {
// TODO: Handle recovery on failure???
return false;
}
@ -43,6 +44,38 @@ inline bool DoReplicaToMainPromotion(dbms::DbmsHandler &dbms_handler) {
return true;
};
/// Switches this instance to the REPLICA role: drops every replication client it currently holds,
/// changes the role on the replication state and then starts the replica RPC server.
///
/// @param dbms_handler handler whose replication state and databases are modified
/// @param config server configuration forwarded to the replication state's SetReplicationRoleReplica
/// @return false when the instance is already a REPLICA, or when the replication data still holds
///         RoleMainData after the role switch; otherwise the result of StartRpcServer
inline bool SetReplicationRoleReplica(dbms::DbmsHandler &dbms_handler,
                                      const memgraph::replication::ReplicationServerConfig &config) {
  auto &repl_state = dbms_handler.ReplicationState();
  if (repl_state.IsReplica()) return false;

  // TODO StorageState needs to be synched. Could have a dangling reference if someone adds a database as we are
  // deleting the replica.
  // Database-level clients go first: empty each storage's client list under its lock.
  dbms_handler.ForEach([](DatabaseAccess db_acc) {
    auto &db_clients = db_acc->storage()->repl_storage_state_.replication_clients_;
    db_clients.WithLock([](auto &clients) { clients.clear(); });
  });

  // Instance-level clients next. std::get assumes the data holds RoleMainData at this point
  // (the instance is not a REPLICA) -- NOTE(review): confirm no other alternative is possible.
  std::get<replication::RoleMainData>(repl_state.ReplicationData()).registered_replicas_.clear();

  // Role switch; this is what creates the replica server.
  repl_state.SetReplicationRoleReplica(config);

  // Start the server. Still seeing RoleMainData here means the switch did not take effect.
  auto const start_server = utils::Overloaded{
      [](replication::RoleMainData const &) {
        // ASSERT
        return false;
      },
      [&dbms_handler](replication::RoleReplicaData const &data) { return StartRpcServer(dbms_handler, data); }};

  // TODO Handle error (restore to main?)
  return std::visit(start_server, repl_state.ReplicationData());
}
inline bool RegisterAllDatabasesClients(dbms::DbmsHandler &dbms_handler,
replication::ReplicationClient &instance_client) {
if (!allow_mt_repl && dbms_handler.All().size() > 1) {
@ -69,7 +102,7 @@ inline bool RegisterAllDatabasesClients(dbms::DbmsHandler &dbms_handler,
// MAYBE_BEHIND isn't a statement of the current state, this is the default value
// Failed to start due an error like branching of MAIN and REPLICA
if (client->State() == storage::replication::ReplicaState::MAYBE_BEHIND) {
return false;
return false; // TODO: sometimes we need to still add to storage_clients
}
storage_clients.push_back(std::move(client));
return true;
@ -79,7 +112,7 @@ inline bool RegisterAllDatabasesClients(dbms::DbmsHandler &dbms_handler,
return all_clients_good;
}
inline std::optional<RegisterReplicaError> HandleErrorOnReplicaClient(
inline std::optional<RegisterReplicaError> HandleRegisterReplicaStatus(
utils::BasicResult<replication::RegisterReplicaError, replication::ReplicationClient *> &instance_client) {
if (instance_client.HasError()) switch (instance_client.GetError()) {
case replication::RegisterReplicaError::NOT_MAIN:
@ -87,8 +120,8 @@ inline std::optional<RegisterReplicaError> HandleErrorOnReplicaClient(
return {};
case replication::RegisterReplicaError::NAME_EXISTS:
return dbms::RegisterReplicaError::NAME_EXISTS;
case replication::RegisterReplicaError::END_POINT_EXISTS:
return dbms::RegisterReplicaError::END_POINT_EXISTS;
case replication::RegisterReplicaError::ENDPOINT_EXISTS:
return dbms::RegisterReplicaError::ENDPOINT_EXISTS;
case replication::RegisterReplicaError::COULD_NOT_BE_PERSISTED:
return dbms::RegisterReplicaError::COULD_NOT_BE_PERSISTED;
case replication::RegisterReplicaError::SUCCESS:

View File

@ -32,7 +32,7 @@
#include "utils/timer.hpp"
#include "version.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
bool ValidateControlCharacter(const char *flagname, const std::string &value) {
if (value.empty()) {

View File

@ -3072,14 +3072,11 @@ class CoordinatorQuery : public memgraph::query::Query {
const utils::TypeInfo &GetTypeInfo() const override { return kType; }
enum class Action {
REGISTER_MAIN_COORDINATOR_SERVER,
REGISTER_REPLICA_COORDINATOR_SERVER,
REGISTER_INSTANCE,
SET_INSTANCE_TO_MAIN,
SHOW_REPLICATION_CLUSTER,
DO_FAILOVER
};
enum class ReplicationRole { MAIN, REPLICA };
enum class SyncMode { SYNC, ASYNC };
CoordinatorQuery() = default;
@ -3087,18 +3084,17 @@ class CoordinatorQuery : public memgraph::query::Query {
DEFVISITABLE(QueryVisitor<void>);
memgraph::query::CoordinatorQuery::Action action_;
memgraph::query::CoordinatorQuery::ReplicationRole role_;
std::string instance_name_;
memgraph::query::Expression *socket_address_{nullptr};
memgraph::query::Expression *replication_socket_address_{nullptr};
memgraph::query::Expression *coordinator_socket_address_{nullptr};
memgraph::query::CoordinatorQuery::SyncMode sync_mode_;
CoordinatorQuery *Clone(AstStorage *storage) const override {
auto *object = storage->Create<CoordinatorQuery>();
object->action_ = action_;
object->role_ = role_;
object->instance_name_ = instance_name_;
object->socket_address_ = socket_address_ ? socket_address_->Clone(storage) : nullptr;
object->replication_socket_address_ =
replication_socket_address_ ? replication_socket_address_->Clone(storage) : nullptr;
object->sync_mode_ = sync_mode_;
object->coordinator_socket_address_ =
coordinator_socket_address_ ? coordinator_socket_address_->Clone(storage) : nullptr;

View File

@ -375,10 +375,28 @@ antlrcpp::Any CypherMainVisitor::visitRegisterReplica(MemgraphCypher::RegisterRe
}
// License check is done in the interpreter.
antlrcpp::Any CypherMainVisitor::visitRegisterCoordinatorServer(MemgraphCypher::RegisterCoordinatorServerContext *ctx) {
MG_ASSERT(ctx->children.size() == 1, "RegisterCoordinatorServerQuery should have exactly one child!");
auto *coordinator_query = std::any_cast<CoordinatorQuery *>(ctx->children[0]->accept(this));
query_ = coordinator_query;
/**
 * Builds a CoordinatorQuery with action REGISTER_INSTANCE from the parsed statement.
 *
 * Both socket addresses must be string literals. The instance name is taken from the symbolic
 * name, and the sync mode is ASYNC when the ASYNC token is present and SYNC otherwise.
 *
 * @throw SemanticException if either socket address is not a string literal.
 * @return CoordinatorQuery*
 */
antlrcpp::Any CypherMainVisitor::visitRegisterInstanceOnCoordinator(
    MemgraphCypher::RegisterInstanceOnCoordinatorContext *ctx) {
  using SyncMode = memgraph::query::CoordinatorQuery::SyncMode;

  // The query object is created before validation, so it lives in storage_ even if validation throws.
  auto *query = storage_->Create<CoordinatorQuery>();

  auto *replication_address_ctx = ctx->replicationSocketAddress();
  auto *coordinator_address_ctx = ctx->coordinatorSocketAddress();

  if (!replication_address_ctx->literal()->StringLiteral()) {
    throw SemanticException("Replication socket address should be a string literal!");
  }
  if (!coordinator_address_ctx->literal()->StringLiteral()) {
    throw SemanticException("Coordinator socket address should be a string literal!");
  }

  query->action_ = CoordinatorQuery::Action::REGISTER_INSTANCE;
  query->replication_socket_address_ = std::any_cast<Expression *>(replication_address_ctx->accept(this));
  query->coordinator_socket_address_ = std::any_cast<Expression *>(coordinator_address_ctx->accept(this));
  query->instance_name_ = std::any_cast<std::string>(ctx->instanceName()->symbolicName()->accept(this));
  query->sync_mode_ = ctx->ASYNC() ? SyncMode::ASYNC : SyncMode::SYNC;

  return query;
}
@ -389,48 +407,6 @@ antlrcpp::Any CypherMainVisitor::visitShowReplicationCluster(MemgraphCypher::Sho
return coordinator_query;
}
// License check is done in the interpreter
antlrcpp::Any CypherMainVisitor::visitRegisterReplicaCoordinatorServer(
MemgraphCypher::RegisterReplicaCoordinatorServerContext *ctx) {
auto *coordinator_query = storage_->Create<CoordinatorQuery>();
if (!ctx->socketAddress()->literal()->StringLiteral()) {
throw SemanticException("Socket address should be a string literal!");
}
if (!ctx->coordinatorSocketAddress()->literal()->StringLiteral()) {
throw SemanticException("Coordinator socket address should be a string literal!");
}
coordinator_query->action_ = CoordinatorQuery::Action::REGISTER_REPLICA_COORDINATOR_SERVER;
coordinator_query->role_ = CoordinatorQuery::ReplicationRole::REPLICA;
coordinator_query->socket_address_ = std::any_cast<Expression *>(ctx->socketAddress()->accept(this));
coordinator_query->coordinator_socket_address_ =
std::any_cast<Expression *>(ctx->coordinatorSocketAddress()->accept(this));
coordinator_query->instance_name_ = std::any_cast<std::string>(ctx->instanceName()->symbolicName()->accept(this));
if (ctx->SYNC()) {
coordinator_query->sync_mode_ = memgraph::query::CoordinatorQuery::SyncMode::SYNC;
} else if (ctx->ASYNC()) {
coordinator_query->sync_mode_ = memgraph::query::CoordinatorQuery::SyncMode::ASYNC;
}
return coordinator_query;
}
// License check is done in the interpreter
antlrcpp::Any CypherMainVisitor::visitRegisterMainCoordinatorServer(
MemgraphCypher::RegisterMainCoordinatorServerContext *ctx) {
if (!ctx->coordinatorSocketAddress()->literal()->StringLiteral()) {
throw SemanticException("Coordinator socket address should be a string literal!");
}
auto *coordinator_query = storage_->Create<CoordinatorQuery>();
coordinator_query->action_ = CoordinatorQuery::Action::REGISTER_MAIN_COORDINATOR_SERVER;
coordinator_query->role_ = CoordinatorQuery::ReplicationRole::MAIN;
coordinator_query->coordinator_socket_address_ =
std::any_cast<Expression *>(ctx->coordinatorSocketAddress()->accept(this));
coordinator_query->instance_name_ = std::any_cast<std::string>(ctx->instanceName()->symbolicName()->accept(this));
return coordinator_query;
}
antlrcpp::Any CypherMainVisitor::visitDropReplica(MemgraphCypher::DropReplicaContext *ctx) {
auto *replication_query = storage_->Create<ReplicationQuery>();
replication_query->action_ = ReplicationQuery::Action::DROP_REPLICA;
@ -445,9 +421,10 @@ antlrcpp::Any CypherMainVisitor::visitShowReplicas(MemgraphCypher::ShowReplicasC
}
// License check is done in the interpreter
antlrcpp::Any CypherMainVisitor::visitDoFailover(MemgraphCypher::DoFailoverContext * /*ctx*/) {
antlrcpp::Any CypherMainVisitor::visitSetInstanceToMain(MemgraphCypher::SetInstanceToMainContext *ctx) {
auto *coordinator_query = storage_->Create<CoordinatorQuery>();
coordinator_query->action_ = CoordinatorQuery::Action::DO_FAILOVER;
coordinator_query->action_ = CoordinatorQuery::Action::SET_INSTANCE_TO_MAIN;
coordinator_query->instance_name_ = std::any_cast<std::string>(ctx->instanceName()->symbolicName()->accept(this));
query_ = coordinator_query;
return coordinator_query;
}

View File

@ -241,29 +241,18 @@ class CypherMainVisitor : public antlropencypher::MemgraphCypherBaseVisitor {
/**
* @return CoordinatorQuery*
*/
antlrcpp::Any visitRegisterCoordinatorServer(MemgraphCypher::RegisterCoordinatorServerContext *ctx) override;
antlrcpp::Any visitRegisterInstanceOnCoordinator(MemgraphCypher::RegisterInstanceOnCoordinatorContext *ctx) override;
/**
* @return CoordinatorQuery*
*/
antlrcpp::Any visitRegisterMainCoordinatorServer(MemgraphCypher::RegisterMainCoordinatorServerContext *ctx) override;
/**
* @return CoordinatorQuery*
*/
antlrcpp::Any visitRegisterReplicaCoordinatorServer(
MemgraphCypher::RegisterReplicaCoordinatorServerContext *ctx) override;
antlrcpp::Any visitSetInstanceToMain(MemgraphCypher::SetInstanceToMainContext *ctx) override;
/**
* @return CoordinatorQuery*
*/
antlrcpp::Any visitShowReplicationCluster(MemgraphCypher::ShowReplicationClusterContext *ctx) override;
/**
* @return CoordinatorQuery*
*/
antlrcpp::Any visitDoFailover(MemgraphCypher::DoFailoverContext *ctx) override;
/**
* @return LockPathQuery*
*/

View File

@ -102,6 +102,7 @@ FILTER : F I L T E R ;
IN : I N ;
INDEX : I N D E X ;
INFO : I N F O ;
INSTANCE : I N S T A N C E ;
IS : I S ;
KB : K B ;
KEY : K E Y ;
@ -122,6 +123,7 @@ PROCEDURE : P R O C E D U R E ;
PROFILE : P R O F I L E ;
QUERY : Q U E R Y ;
REDUCE : R E D U C E ;
REGISTER : R E G I S T E R;
REMOVE : R E M O V E ;
RETURN : R E T U R N ;
SET : S E T ;

View File

@ -63,6 +63,7 @@ memgraphCypherKeyword : cypherKeyword
| GRANT
| HEADER
| IDENTIFIED
| INSTANCE
| NODE_LABELS
| NULLIF
| IMPORT
@ -186,9 +187,9 @@ replicationQuery : setReplicationRole
| showReplicas
;
coordinatorQuery : registerCoordinatorServer
coordinatorQuery : registerInstanceOnCoordinator
| setInstanceToMain
| showReplicationCluster
| doFailover
;
triggerQuery : createTrigger
@ -252,8 +253,6 @@ transactionQueueQuery : showTransactions
showTransactions : SHOW TRANSACTIONS ;
doFailover : DO FAILOVER ;
terminateTransactions : TERMINATE TRANSACTIONS transactionIdList;
loadCsv : LOAD CSV FROM csvFile ( WITH | NO ) HEADER
@ -382,15 +381,14 @@ instanceName : symbolicName ;
socketAddress : literal ;
coordinatorSocketAddress : literal ;
replicationSocketAddress : literal ;
registerReplica : REGISTER REPLICA instanceName ( SYNC | ASYNC )
TO socketAddress ;
registerReplicaCoordinatorServer: REGISTER REPLICA instanceName ( ASYNC | SYNC ) TO socketAddress WITH COORDINATOR SERVER ON coordinatorSocketAddress ;
registerInstanceOnCoordinator : REGISTER INSTANCE instanceName ON coordinatorSocketAddress ( AS ASYNC ) ? WITH replicationSocketAddress ;
registerMainCoordinatorServer: REGISTER MAIN instanceName WITH COORDINATOR SERVER ON coordinatorSocketAddress ;
registerCoordinatorServer : registerMainCoordinatorServer | registerReplicaCoordinatorServer ;
setInstanceToMain : SET INSTANCE instanceName TO MAIN ;
dropReplica : DROP REPLICA instanceName ;

View File

@ -79,6 +79,7 @@ IMPORT : I M P O R T ;
INACTIVE : I N A C T I V E ;
IN_MEMORY_ANALYTICAL : I N UNDERSCORE M E M O R Y UNDERSCORE A N A L Y T I C A L ;
IN_MEMORY_TRANSACTIONAL : I N UNDERSCORE M E M O R Y UNDERSCORE T R A N S A C T I O N A L ;
INSTANCE : I N S T A N C E ;
ISOLATION : I S O L A T I O N ;
KAFKA : K A F K A ;
LABELS : L A B E L S ;

View File

@ -1,4 +1,4 @@
// Copyright 2023 Memgraph Ltd.
// Copyright 2024 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@ -218,7 +218,8 @@ const trie::Trie kKeywords = {"union",
"directory",
"lock",
"unlock",
"build"};
"build",
"instance"};
// Unicode codepoints that are allowed at the start of the unescaped name.
const std::bitset<kBitsetSize> kUnescapedNameAllowedStarts(

View File

@ -110,7 +110,6 @@
#ifdef MG_ENTERPRISE
#include "coordination/constants.hpp"
#include "coordination/coordinator_entity_info.hpp"
#endif
namespace memgraph::metrics {
@ -337,9 +336,9 @@ class ReplQueryHandler {
/// @throw QueryRuntimeException if an error occurred.
ReplicationQuery::ReplicationRole ShowReplicationRole() const {
switch (handler_.GetRole()) {
case memgraph::replication::ReplicationRole::MAIN:
case memgraph::replication_coordination_glue::ReplicationRole::MAIN:
return ReplicationQuery::ReplicationRole::MAIN;
case memgraph::replication::ReplicationRole::REPLICA:
case memgraph::replication_coordination_glue::ReplicationRole::REPLICA:
return ReplicationQuery::ReplicationRole::REPLICA;
}
throw QueryRuntimeException("Couldn't show replication role - invalid role set!");
@ -462,11 +461,9 @@ class CoordQueryHandler final : public query::CoordinatorQueryHandler {
#ifdef MG_ENTERPRISE
/// @throw QueryRuntimeException if an error occurred.
void RegisterReplicaCoordinatorServer(const std::string &replication_socket_address,
const std::string &coordinator_socket_address,
const std::chrono::seconds instance_check_frequency,
const std::string &instance_name,
CoordinatorQuery::SyncMode sync_mode) override {
void RegisterInstance(const std::string &coordinator_socket_address, const std::string &replication_socket_address,
const std::chrono::seconds instance_check_frequency, const std::string &instance_name,
CoordinatorQuery::SyncMode sync_mode) override {
const auto maybe_replication_ip_port =
io::network::Endpoint::ParseSocketOrAddress(replication_socket_address, std::nullopt);
if (!maybe_replication_ip_port) {
@ -487,7 +484,7 @@ class CoordQueryHandler final : public query::CoordinatorQueryHandler {
.replication_ip_address = replication_ip,
.replication_port = replication_port};
const auto coordinator_client_config =
auto coordinator_client_config =
coordination::CoordinatorClientConfig{.instance_name = instance_name,
.ip_address = coordinator_server_ip,
.port = coordinator_server_port,
@ -495,87 +492,49 @@ class CoordQueryHandler final : public query::CoordinatorQueryHandler {
.replication_client_info = repl_config,
.ssl = std::nullopt};
if (const auto ret = coordinator_handler_.RegisterReplicaOnCoordinator(coordinator_client_config); ret.HasError()) {
throw QueryRuntimeException("Couldn't register replica on coordinator!");
}
}
void RegisterMainCoordinatorServer(const std::string &coordinator_socket_address,
const std::chrono::seconds instance_check_frequency,
const std::string &instance_name) override {
const auto maybe_ip_and_port =
io::network::Endpoint::ParseSocketOrAddress(coordinator_socket_address, std::nullopt);
if (!maybe_ip_and_port) {
throw QueryRuntimeException("Invalid socket address!");
}
const auto [ip, port] = *maybe_ip_and_port;
const auto config = coordination::CoordinatorClientConfig{.instance_name = instance_name,
.ip_address = ip,
.port = port,
.health_check_frequency_sec = instance_check_frequency,
.ssl = std::nullopt};
if (const auto ret = coordinator_handler_.RegisterMainOnCoordinator(config); ret.HasError()) {
throw QueryRuntimeException("Couldn't register main on coordinator!");
}
}
/// @throw QueryRuntimeException if an error ocurred.
void DoFailover() const override {
if (!FLAGS_coordinator) {
throw QueryRuntimeException("Only coordinator can register coordinator server!");
}
auto status = coordinator_handler_.DoFailover();
auto status = coordinator_handler_.RegisterInstance(coordinator_client_config);
switch (status) {
using enum memgraph::dbms::DoFailoverStatus;
case ALL_REPLICAS_DOWN:
throw QueryRuntimeException("Failover aborted since all replicas are down!");
case MAIN_ALIVE:
throw QueryRuntimeException("Failover aborted since main is alive!");
case CLUSTER_UNINITIALIZED:
throw QueryRuntimeException("Failover aborted since cluster is uninitialized!");
using enum memgraph::coordination::RegisterInstanceCoordinatorStatus;
case NAME_EXISTS:
throw QueryRuntimeException("Couldn't register replica instance since instance with such name already exists!");
case END_POINT_EXISTS:
throw QueryRuntimeException(
"Couldn't register replica instance since instance with such endpoint already exists!");
case NOT_COORDINATOR:
throw QueryRuntimeException("Couldn't register replica instance since this instance is not a coordinator!");
case RPC_FAILED:
throw QueryRuntimeException(
"Couldn't register instance because setting instance to replica failed! Check logs on replica to find out "
"more "
"info!");
case SUCCESS:
break;
}
}
std::vector<MainReplicaStatus> ShowMainReplicaStatus(
const std::vector<coordination::CoordinatorEntityInfo> &replicas,
const std::unordered_map<std::string_view, bool> &health_check_replicas,
const std::optional<coordination::CoordinatorEntityInfo> &main,
const std::optional<coordination::CoordinatorEntityHealthInfo> &health_check_main) const override {
std::vector<MainReplicaStatus> result{};
result.reserve(replicas.size() + 1); // replicas + 1 main
std::ranges::transform(
replicas, std::back_inserter(result), [&health_check_replicas](const auto &replica) -> MainReplicaStatus {
return {replica.name, replica.endpoint.SocketAddress(), health_check_replicas.at(replica.name), false};
});
if (main) {
bool is_main_alive = health_check_main.has_value() ? health_check_main.value().alive : false;
result.emplace_back(main->name, main->endpoint.SocketAddress(), is_main_alive, true);
void SetInstanceToMain(const std::string &instance_name) override {
auto status = coordinator_handler_.SetInstanceToMain(instance_name);
switch (status) {
using enum memgraph::coordination::SetInstanceToMainCoordinatorStatus;
case NO_INSTANCE_WITH_NAME:
throw QueryRuntimeException("No instance with such name!");
case NOT_COORDINATOR:
throw QueryRuntimeException("Couldn't set replica instance to main since this instance is not a coordinator!");
case COULD_NOT_PROMOTE_TO_MAIN:
throw QueryRuntimeException(
"Couldn't set replica instance to main. Check coordinator and replica for more logs");
case SUCCESS:
break;
}
return result;
}
#endif
#ifdef MG_ENTERPRISE
std::vector<coordination::CoordinatorEntityInfo> ShowReplicasOnCoordinator() const override {
return coordinator_handler_.ShowReplicasOnCoordinator();
/// Pass-through: returns whatever coordinator_handler_.ShowInstances() reports, unchanged.
/// @return one CoordinatorInstanceStatus entry per instance reported by the coordinator handler
std::vector<coordination::CoordinatorInstanceStatus> ShowInstances() const override {
return coordinator_handler_.ShowInstances();
}
std::unordered_map<std::string_view, bool> PingReplicasOnCoordinator() const override {
return coordinator_handler_.PingReplicasOnCoordinator();
}
std::optional<coordination::CoordinatorEntityInfo> ShowMainOnCoordinator() const override {
return coordinator_handler_.ShowMainOnCoordinator();
}
std::optional<coordination::CoordinatorEntityHealthInfo> PingMainOnCoordinator() const override {
return coordinator_handler_.PingMainOnCoordinator();
}
#endif
private:
@ -890,10 +849,10 @@ Callback HandleReplicationQuery(ReplicationQuery *repl_query, const Parameters &
case ReplicationQuery::Action::SET_REPLICATION_ROLE: {
#ifdef MG_ENTERPRISE
if (FLAGS_coordinator) {
if (repl_query->role_ == ReplicationQuery::ReplicationRole::REPLICA) {
throw QueryRuntimeException("Coordinator cannot become a replica!");
}
throw QueryRuntimeException("Coordinator cannot become main!");
throw QueryRuntimeException("Coordinator can't set roles!");
}
if (FLAGS_coordinator_server_port) {
throw QueryRuntimeException("Can't set role manually on instance with coordinator server port.");
}
#endif
@ -938,6 +897,11 @@ Callback HandleReplicationQuery(ReplicationQuery *repl_query, const Parameters &
return callback;
}
case ReplicationQuery::Action::REGISTER_REPLICA: {
#ifdef MG_ENTERPRISE
if (FLAGS_coordinator_server_port) {
throw QueryRuntimeException("Can't register replica manually on instance with coordinator server port.");
}
#endif
const auto &name = repl_query->instance_name_;
const auto &sync_mode = repl_query->sync_mode_;
auto socket_address = repl_query->socket_address_->Accept(evaluator);
@ -954,6 +918,11 @@ Callback HandleReplicationQuery(ReplicationQuery *repl_query, const Parameters &
}
case ReplicationQuery::Action::DROP_REPLICA: {
#ifdef MG_ENTERPRISE
if (FLAGS_coordinator_server_port) {
throw QueryRuntimeException("Can't drop replica manually on instance with coordinator server port.");
}
#endif
const auto &name = repl_query->instance_name_;
callback.fn = [handler = ReplQueryHandler{dbms_handler}, name]() mutable {
handler.DropReplica(name);
@ -1026,7 +995,7 @@ Callback HandleCoordinatorQuery(CoordinatorQuery *coordinator_query, const Param
std::vector<Notification> *notifications) {
Callback callback;
switch (coordinator_query->action_) {
case CoordinatorQuery::Action::REGISTER_MAIN_COORDINATOR_SERVER: {
case CoordinatorQuery::Action::REGISTER_INSTANCE: {
if (!license::global_license_checker.IsEnterpriseValidFast()) {
throw QueryException("Trying to use enterprise feature without a valid license.");
}
@ -1045,11 +1014,14 @@ Callback HandleCoordinatorQuery(CoordinatorQuery *coordinator_query, const Param
auto evaluator = PrimitiveLiteralExpressionEvaluator{evaluation_context};
auto coordinator_socket_address_tv = coordinator_query->coordinator_socket_address_->Accept(evaluator);
auto replication_socket_address_tv = coordinator_query->replication_socket_address_->Accept(evaluator);
callback.fn = [handler = CoordQueryHandler{dbms_handler}, coordinator_socket_address_tv,
main_check_frequency = config.replication_replica_check_frequency,
instance_name = coordinator_query->instance_name_]() mutable {
handler.RegisterMainCoordinatorServer(std::string(coordinator_socket_address_tv.ValueString()),
main_check_frequency, instance_name);
replication_socket_address_tv, main_check_frequency = config.replication_replica_check_frequency,
instance_name = coordinator_query->instance_name_,
sync_mode = coordinator_query->sync_mode_]() mutable {
handler.RegisterInstance(std::string(coordinator_socket_address_tv.ValueString()),
std::string(replication_socket_address_tv.ValueString()), main_check_frequency,
instance_name, sync_mode);
return std::vector<std::vector<TypedValue>>();
};
@ -1060,7 +1032,7 @@ Callback HandleCoordinatorQuery(CoordinatorQuery *coordinator_query, const Param
return callback;
#endif
}
case CoordinatorQuery::Action::REGISTER_REPLICA_COORDINATOR_SERVER: {
case CoordinatorQuery::Action::SET_INSTANCE_TO_MAIN: {
if (!license::global_license_checker.IsEnterpriseValidFast()) {
throw QueryException("Trying to use enterprise feature without a valid license.");
}
@ -1077,22 +1049,13 @@ Callback HandleCoordinatorQuery(CoordinatorQuery *coordinator_query, const Param
// the argument to Callback.
EvaluationContext evaluation_context{.timestamp = QueryTimestamp(), .parameters = parameters};
auto evaluator = PrimitiveLiteralExpressionEvaluator{evaluation_context};
auto coordinator_socket_address_tv = coordinator_query->coordinator_socket_address_->Accept(evaluator);
auto replication_socket_address_tv = coordinator_query->socket_address_->Accept(evaluator);
callback.fn = [handler = CoordQueryHandler{dbms_handler}, coordinator_socket_address_tv,
replication_socket_address_tv, main_check_frequency = config.replication_replica_check_frequency,
instance_name = coordinator_query->instance_name_,
sync_mode = coordinator_query->sync_mode_]() mutable {
handler.RegisterReplicaCoordinatorServer(std::string(replication_socket_address_tv.ValueString()),
std::string(coordinator_socket_address_tv.ValueString()),
main_check_frequency, instance_name, sync_mode);
callback.fn = [handler = CoordQueryHandler{dbms_handler},
instance_name = coordinator_query->instance_name_]() mutable {
handler.SetInstanceToMain(instance_name);
return std::vector<std::vector<TypedValue>>();
};
notifications->emplace_back(
SeverityLevel::INFO, NotificationCode::REGISTER_COORDINATOR_SERVER,
fmt::format("Coordinator has registered coordinator server on {} for instance {}.",
coordinator_socket_address_tv.ValueString(), coordinator_query->instance_name_));
return callback;
#endif
}
@ -1112,57 +1075,19 @@ Callback HandleCoordinatorQuery(CoordinatorQuery *coordinator_query, const Param
callback.header = {"name", "socket_address", "alive", "role"};
callback.fn = [handler = CoordQueryHandler{dbms_handler}, replica_nfields = callback.header.size()]() mutable {
const auto main = handler.ShowMainOnCoordinator();
const auto health_check_main = main ? handler.PingMainOnCoordinator() : std::nullopt;
const auto result_status = handler.ShowMainReplicaStatus(
handler.ShowReplicasOnCoordinator(), handler.PingReplicasOnCoordinator(), main, health_check_main);
auto const instances = handler.ShowInstances();
std::vector<std::vector<TypedValue>> result{};
result.reserve(result_status.size());
// Reserve room for one row per instance; `result` is still empty at this point,
// so reserving result.size() would reserve nothing.
result.reserve(instances.size());
std::ranges::transform(result_status, std::back_inserter(result),
std::ranges::transform(instances, std::back_inserter(result),
[](const auto &status) -> std::vector<TypedValue> {
return {TypedValue{status.name}, TypedValue{status.socket_address},
TypedValue{status.alive}, TypedValue{status.is_main ? "main" : "replica"}};
return {TypedValue{status.instance_name}, TypedValue{status.socket_address},
TypedValue{status.is_alive}, TypedValue{status.replication_role}};
});
return result;
};
return callback;
#endif
}
case CoordinatorQuery::Action::DO_FAILOVER: {
if (!license::global_license_checker.IsEnterpriseValidFast()) {
throw QueryException("Trying to use enterprise feature without a valid license.");
}
#ifdef MG_ENTERPRISE
if constexpr (!coordination::allow_ha) {
throw QueryRuntimeException(
"High availability is experimental feature. Please set MG_EXPERIMENTAL_HIGH_AVAILABILITY compile flag to "
"be able to use this functionality.");
}
if (!FLAGS_coordinator) {
throw QueryRuntimeException("Only coordinator can run DO FAILOVER!");
}
callback.header = {"name", "socket_address", "alive", "role"};
callback.fn = [handler = CoordQueryHandler{dbms_handler}]() mutable {
handler.DoFailover();
const auto main = handler.ShowMainOnCoordinator();
const auto health_check_main = main ? handler.PingMainOnCoordinator() : std::nullopt;
const auto result_status = handler.ShowMainReplicaStatus(
handler.ShowReplicasOnCoordinator(), handler.PingReplicasOnCoordinator(), main, health_check_main);
std::vector<std::vector<TypedValue>> result{};
result.reserve(result_status.size());
std::ranges::transform(result_status, std::back_inserter(result),
[](const auto &status) -> std::vector<TypedValue> {
return {TypedValue{status.name}, TypedValue{status.socket_address},
TypedValue{status.alive}, TypedValue{status.is_main ? "main" : "replica"}};
});
return result;
};
notifications->emplace_back(SeverityLevel::INFO, NotificationCode::DO_FAILOVER,
"DO FAILOVER called on coordinator.");
return callback;
#endif
}
return callback;
@ -3157,7 +3082,7 @@ PreparedQuery PrepareEdgeImportModeQuery(ParsedQuery parsed_query, CurrentDB &cu
}
PreparedQuery PrepareCreateSnapshotQuery(ParsedQuery parsed_query, bool in_explicit_transaction, CurrentDB &current_db,
replication::ReplicationRole replication_role) {
replication_coordination_glue::ReplicationRole replication_role) {
if (in_explicit_transaction) {
throw CreateSnapshotInMulticommandTxException();
}

View File

@ -15,7 +15,6 @@
#include <gflags/gflags.h>
#include "coordination/coordinator_entity_info.hpp"
#include "dbms/database.hpp"
#include "dbms/dbms_handler.hpp"
#include "memory/query_memory_control.hpp"
@ -53,6 +52,10 @@
#include "utils/timer.hpp"
#include "utils/tsc.hpp"
#ifdef MG_ENTERPRISE
#include "coordination/coordinator_instance_status.hpp"
#endif
namespace memgraph::metrics {
extern const Event FailedQuery;
extern const Event FailedPrepare;
@ -93,47 +96,27 @@ class CoordinatorQueryHandler {
#ifdef MG_ENTERPRISE
struct MainReplicaStatus {
std::string_view name;
std::string socket_address;
std::string_view socket_address;
bool alive;
bool is_main;
MainReplicaStatus(std::string_view name, std::string socket_address, bool alive, bool is_main)
: name{name}, socket_address{std::move(socket_address)}, alive{alive}, is_main{is_main} {}
MainReplicaStatus(std::string_view name, std::string_view socket_address, bool alive, bool is_main)
: name{name}, socket_address{socket_address}, alive{alive}, is_main{is_main} {}
};
#endif
#ifdef MG_ENTERPRISE
/// @throw QueryRuntimeException if an error occurred.
virtual void RegisterReplicaCoordinatorServer(const std::string &replication_socket_address,
const std::string &coordinator_socket_address,
const std::chrono::seconds instance_check_frequency,
const std::string &instance_name,
CoordinatorQuery::SyncMode sync_mode) = 0;
virtual void RegisterMainCoordinatorServer(const std::string &socket_address,
const std::chrono::seconds instance_check_frequency,
const std::string &instance_name) = 0;
virtual void RegisterInstance(const std::string &coordinator_socket_address,
const std::string &replication_socket_address,
const std::chrono::seconds instance_check_frequency, const std::string &instance_name,
CoordinatorQuery::SyncMode sync_mode) = 0;
/// @throw QueryRuntimeException if an error occurred.
virtual std::vector<coordination::CoordinatorEntityInfo> ShowReplicasOnCoordinator() const = 0;
virtual void SetInstanceToMain(const std::string &instance_name) = 0;
/// @throw QueryRuntimeException if an error occurred.
virtual std::optional<coordination::CoordinatorEntityInfo> ShowMainOnCoordinator() const = 0;
/// @throw QueryRuntimeException if an error ocurred.
virtual std::unordered_map<std::string_view, bool> PingReplicasOnCoordinator() const = 0;
/// @throw QueryRuntimeException if an error ocurred.
virtual std::optional<coordination::CoordinatorEntityHealthInfo> PingMainOnCoordinator() const = 0;
/// @throw QueryRuntimeException if an error ocurred.
virtual void DoFailover() const = 0;
/// @throw QueryRuntimeException if an error ocurred.
virtual std::vector<MainReplicaStatus> ShowMainReplicaStatus(
const std::vector<coordination::CoordinatorEntityInfo> &replicas,
const std::unordered_map<std::string_view, bool> &health_check_replicas,
const std::optional<coordination::CoordinatorEntityInfo> &main,
const std::optional<coordination::CoordinatorEntityHealthInfo> &health_check_main) const = 0;
virtual std::vector<coordination::CoordinatorInstanceStatus> ShowInstances() const = 0;
#endif
};

View File

@ -69,8 +69,6 @@ constexpr std::string_view GetCodeString(const NotificationCode code) {
#ifdef MG_ENTERPRISE
case NotificationCode::REGISTER_COORDINATOR_SERVER:
return "RegisterCoordinatorServer"sv;
case NotificationCode::DO_FAILOVER:
return "DoFailover"sv;
#endif
case NotificationCode::REPLICA_PORT_WARNING:
return "ReplicaPortWarning"sv;

View File

@ -44,7 +44,6 @@ enum class NotificationCode : uint8_t {
REGISTER_REPLICA,
#ifdef MG_ENTERPRISE
REGISTER_COORDINATOR_SERVER,
DO_FAILOVER,
#endif
SET_REPLICA,
START_STREAM,

View File

@ -5,7 +5,6 @@ target_sources(mg-replication
include/replication/state.hpp
include/replication/epoch.hpp
include/replication/config.hpp
include/replication/role.hpp
include/replication/status.hpp
include/replication/messages.hpp
include/replication/replication_client.hpp

View File

@ -40,7 +40,7 @@ struct ReplicationClientConfig {
friend bool operator==(const SSL &, const SSL &) = default;
};
std::optional<SSL> ssl;
std::optional<SSL> ssl{};
friend bool operator==(ReplicationClientConfig const &, ReplicationClientConfig const &) = default;
};

View File

@ -15,8 +15,8 @@
#include "replication/config.hpp"
#include "replication/epoch.hpp"
#include "replication/replication_client.hpp"
#include "replication/role.hpp"
#include "replication_coordination_glue/mode.hpp"
#include "replication_coordination_glue/role.hpp"
#include "replication_server.hpp"
#include "status.hpp"
#include "utils/result.hpp"
@ -32,7 +32,8 @@ namespace memgraph::replication {
enum class RolePersisted : uint8_t { UNKNOWN_OR_NO, YES };
enum class RegisterReplicaError : uint8_t { NAME_EXISTS, END_POINT_EXISTS, COULD_NOT_BE_PERSISTED, NOT_MAIN, SUCCESS };
// TODO: (andi) Rename Error to Status
enum class RegisterReplicaError : uint8_t { NAME_EXISTS, ENDPOINT_EXISTS, COULD_NOT_BE_PERSISTED, NOT_MAIN, SUCCESS };
struct RoleMainData {
RoleMainData() = default;
@ -72,12 +73,13 @@ struct ReplicationState {
using FetchReplicationResult_t = utils::BasicResult<FetchReplicationError, ReplicationData_t>;
auto FetchReplicationData() -> FetchReplicationResult_t;
auto GetRole() const -> ReplicationRole {
return std::holds_alternative<RoleReplicaData>(replication_data_) ? ReplicationRole::REPLICA
: ReplicationRole::MAIN;
auto GetRole() const -> replication_coordination_glue::ReplicationRole {
return std::holds_alternative<RoleReplicaData>(replication_data_)
? replication_coordination_glue::ReplicationRole::REPLICA
: replication_coordination_glue::ReplicationRole::MAIN;
}
bool IsMain() const { return GetRole() == ReplicationRole::MAIN; }
bool IsReplica() const { return GetRole() == ReplicationRole::REPLICA; }
bool IsMain() const { return GetRole() == replication_coordination_glue::ReplicationRole::MAIN; }
bool IsReplica() const { return GetRole() == replication_coordination_glue::ReplicationRole::REPLICA; }
bool HasDurability() const { return nullptr != durability_; }
@ -92,7 +94,6 @@ struct ReplicationState {
utils::BasicResult<RegisterReplicaError, ReplicationClient *> RegisterReplica(const ReplicationClientConfig &config);
bool SetReplicationRoleMain();
bool SetReplicationRoleReplica(const ReplicationServerConfig &config);
private:

View File

@ -21,7 +21,7 @@
#include "replication/config.hpp"
#include "replication/epoch.hpp"
#include "replication/role.hpp"
#include "replication_coordination_glue/role.hpp"
namespace memgraph::replication::durability {

View File

@ -260,7 +260,7 @@ utils::BasicResult<RegisterReplicaError, ReplicationClient *> ReplicationState::
return std::any_of(replicas.begin(), replicas.end(), endpoint_matches);
};
if (endpoint_check(mainData.registered_replicas_)) {
return RegisterReplicaError::END_POINT_EXISTS;
return RegisterReplicaError::ENDPOINT_EXISTS;
}
// Durability
@ -279,4 +279,5 @@ utils::BasicResult<RegisterReplicaError, ReplicationClient *> ReplicationState::
}
return res;
}
} // namespace memgraph::replication

View File

@ -29,12 +29,14 @@ constexpr auto *kVersion = "durability_version";
void to_json(nlohmann::json &j, const ReplicationRoleEntry &p) {
auto processMAIN = [&](MainRole const &main) {
j = nlohmann::json{{kVersion, p.version}, {kReplicationRole, ReplicationRole::MAIN}, {kEpoch, main.epoch.id()}};
j = nlohmann::json{{kVersion, p.version},
{kReplicationRole, replication_coordination_glue::ReplicationRole::MAIN},
{kEpoch, main.epoch.id()}};
};
auto processREPLICA = [&](ReplicaRole const &replica) {
j = nlohmann::json{
{kVersion, p.version},
{kReplicationRole, ReplicationRole::REPLICA},
{kReplicationRole, replication_coordination_glue::ReplicationRole::REPLICA},
{kIpAddress, replica.config.ip_address},
{kPort, replica.config.port}
// TODO: SSL
@ -47,17 +49,17 @@ void from_json(const nlohmann::json &j, ReplicationRoleEntry &p) {
// This value did not exist in V1, hence default DurabilityVersion::V1
DurabilityVersion version = j.value(kVersion, DurabilityVersion::V1);
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
ReplicationRole role;
replication_coordination_glue::ReplicationRole role;
j.at(kReplicationRole).get_to(role);
switch (role) {
case ReplicationRole::MAIN: {
case replication_coordination_glue::ReplicationRole::MAIN: {
auto json_epoch = j.value(kEpoch, std::string{});
auto epoch = ReplicationEpoch{};
if (!json_epoch.empty()) epoch.SetEpoch(json_epoch);
p = ReplicationRoleEntry{.version = version, .role = MainRole{.epoch = std::move(epoch)}};
break;
}
case ReplicationRole::REPLICA: {
case memgraph::replication_coordination_glue::ReplicationRole::REPLICA: {
std::string ip_address;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
uint16_t port;

View File

@ -5,6 +5,7 @@ target_sources(mg-repl_coord_glue
PUBLIC
messages.hpp
mode.hpp
role.hpp
PRIVATE
messages.cpp

View File

@ -12,8 +12,8 @@
#pragma once
#include <cstdint>
namespace memgraph::replication {
namespace memgraph::replication_coordination_glue {
// TODO: figure out a way of ensuring that usage of this type is never uninitialized/defaulted incorrectly to MAIN
enum class ReplicationRole : uint8_t { MAIN, REPLICA };
} // namespace memgraph::replication
} // namespace memgraph::replication_coordination_glue

View File

@ -288,7 +288,8 @@ DiskStorage::~DiskStorage() {
DiskStorage::DiskAccessor::DiskAccessor(auto tag, DiskStorage *storage, IsolationLevel isolation_level,
StorageMode storage_mode)
: Accessor(tag, storage, isolation_level, storage_mode, memgraph::replication::ReplicationRole::MAIN) {
: Accessor(tag, storage, isolation_level, storage_mode,
memgraph::replication_coordination_glue::ReplicationRole::MAIN) {
rocksdb::WriteOptions write_options;
auto txOptions = rocksdb::TransactionOptions{.set_snapshot = true};
transaction_.disk_transaction_ = storage->kvstore_->db_->BeginTransaction(write_options, txOptions);
@ -837,7 +838,8 @@ StorageInfo DiskStorage::GetBaseInfo(bool /* unused */) {
return info;
}
StorageInfo DiskStorage::GetInfo(bool force_dir, memgraph::replication::ReplicationRole replication_role) {
StorageInfo DiskStorage::GetInfo(bool force_dir,
memgraph::replication_coordination_glue::ReplicationRole replication_role) {
StorageInfo info = GetBaseInfo(force_dir);
{
auto access = Access(replication_role);
@ -2007,7 +2009,7 @@ UniqueConstraints::DeletionStatus DiskStorage::DiskAccessor::DropUniqueConstrain
}
Transaction DiskStorage::CreateTransaction(IsolationLevel isolation_level, StorageMode storage_mode,
memgraph::replication::ReplicationRole /*is_main*/) {
memgraph::replication_coordination_glue::ReplicationRole /*is_main*/) {
/// We acquire the transaction engine lock here because we access (and
/// modify) the transaction engine variables (`transaction_id` and
/// `timestamp`) below.
@ -2032,8 +2034,9 @@ uint64_t DiskStorage::CommitTimestamp(const std::optional<uint64_t> desired_comm
return *desired_commit_timestamp;
}
std::unique_ptr<Storage::Accessor> DiskStorage::Access(memgraph::replication::ReplicationRole /*replication_role*/,
std::optional<IsolationLevel> override_isolation_level) {
std::unique_ptr<Storage::Accessor> DiskStorage::Access(
memgraph::replication_coordination_glue::ReplicationRole /*replication_role*/,
std::optional<IsolationLevel> override_isolation_level) {
auto isolation_level = override_isolation_level.value_or(isolation_level_);
if (isolation_level != IsolationLevel::SNAPSHOT_ISOLATION) {
throw utils::NotYetImplemented("Disk storage supports only SNAPSHOT isolation level.");
@ -2042,7 +2045,7 @@ std::unique_ptr<Storage::Accessor> DiskStorage::Access(memgraph::replication::Re
new DiskAccessor{Storage::Accessor::shared_access, this, isolation_level, storage_mode_});
}
std::unique_ptr<Storage::Accessor> DiskStorage::UniqueAccess(
memgraph::replication::ReplicationRole /*replication_role*/,
memgraph::replication_coordination_glue::ReplicationRole /*replication_role*/,
std::optional<IsolationLevel> override_isolation_level) {
auto isolation_level = override_isolation_level.value_or(isolation_level_);
if (isolation_level != IsolationLevel::SNAPSHOT_ISOLATION) {

View File

@ -176,11 +176,11 @@ class DiskStorage final : public Storage {
};
using Storage::Access;
std::unique_ptr<Accessor> Access(memgraph::replication::ReplicationRole replication_role,
std::unique_ptr<Accessor> Access(memgraph::replication_coordination_glue::ReplicationRole replication_role,
std::optional<IsolationLevel> override_isolation_level) override;
using Storage::UniqueAccess;
std::unique_ptr<Accessor> UniqueAccess(memgraph::replication::ReplicationRole replication_role,
std::unique_ptr<Accessor> UniqueAccess(memgraph::replication_coordination_glue::ReplicationRole replication_role,
std::optional<IsolationLevel> override_isolation_level) override;
/// Flushing methods
@ -285,7 +285,7 @@ class DiskStorage final : public Storage {
RocksDBStorage *GetRocksDBStorage() const { return kvstore_.get(); }
Transaction CreateTransaction(IsolationLevel isolation_level, StorageMode storage_mode,
memgraph::replication::ReplicationRole replication_role) override;
memgraph::replication_coordination_glue::ReplicationRole replication_role) override;
void SetEdgeImportMode(EdgeImportMode edge_import_status);
@ -308,7 +308,8 @@ class DiskStorage final : public Storage {
PropertyId property);
StorageInfo GetBaseInfo(bool force_directory) override;
StorageInfo GetInfo(bool force_directory, memgraph::replication::ReplicationRole replication_role) override;
StorageInfo GetInfo(bool force_directory,
memgraph::replication_coordination_glue::ReplicationRole replication_role) override;
void FreeMemory(std::unique_lock<utils::ResourceLock> /*lock*/) override {}

View File

@ -178,7 +178,7 @@ InMemoryStorage::~InMemoryStorage() {
InMemoryStorage::InMemoryAccessor::InMemoryAccessor(auto tag, InMemoryStorage *storage, IsolationLevel isolation_level,
StorageMode storage_mode,
memgraph::replication::ReplicationRole replication_role)
memgraph::replication_coordination_glue::ReplicationRole replication_role)
: Accessor(tag, storage, isolation_level, storage_mode, replication_role),
config_(storage->config_.salient.items) {}
InMemoryStorage::InMemoryAccessor::InMemoryAccessor(InMemoryAccessor &&other) noexcept
@ -1280,8 +1280,9 @@ VerticesIterable InMemoryStorage::InMemoryAccessor::Vertices(
mem_label_property_index->Vertices(label, property, lower_bound, upper_bound, view, storage_, &transaction_));
}
Transaction InMemoryStorage::CreateTransaction(IsolationLevel isolation_level, StorageMode storage_mode,
memgraph::replication::ReplicationRole replication_role) {
Transaction InMemoryStorage::CreateTransaction(
IsolationLevel isolation_level, StorageMode storage_mode,
memgraph::replication_coordination_glue::ReplicationRole replication_role) {
// We acquire the transaction engine lock here because we access (and
// modify) the transaction engine variables (`transaction_id` and
// `timestamp`) below.
@ -1296,7 +1297,7 @@ Transaction InMemoryStorage::CreateTransaction(IsolationLevel isolation_level, S
// of any query on replica to the last commited transaction
// which is timestamp_ as only commit of transaction with writes
// can change the value of it.
if (replication_role == memgraph::replication::ReplicationRole::MAIN) {
if (replication_role == memgraph::replication_coordination_glue::ReplicationRole::MAIN) {
start_timestamp = timestamp_++;
} else {
start_timestamp = timestamp_;
@ -1683,7 +1684,8 @@ StorageInfo InMemoryStorage::GetBaseInfo(bool force_directory) {
return info;
}
StorageInfo InMemoryStorage::GetInfo(bool force_directory, memgraph::replication::ReplicationRole replication_role) {
StorageInfo InMemoryStorage::GetInfo(bool force_directory,
memgraph::replication_coordination_glue::ReplicationRole replication_role) {
StorageInfo info = GetBaseInfo(force_directory);
{
auto access = Access(replication_role); // TODO: override isolation level?
@ -2004,15 +2006,15 @@ void InMemoryStorage::AppendToWalDataDefinition(durability::StorageMetadataOpera
}
utils::BasicResult<InMemoryStorage::CreateSnapshotError> InMemoryStorage::CreateSnapshot(
memgraph::replication::ReplicationRole replication_role) {
if (replication_role == memgraph::replication::ReplicationRole::REPLICA) {
memgraph::replication_coordination_glue::ReplicationRole replication_role) {
if (replication_role == memgraph::replication_coordination_glue::ReplicationRole::REPLICA) {
return InMemoryStorage::CreateSnapshotError::DisabledForReplica;
}
auto const &epoch = repl_storage_state_.epoch_;
auto snapshot_creator = [this, &epoch]() {
utils::Timer timer;
auto transaction = CreateTransaction(IsolationLevel::SNAPSHOT_ISOLATION, storage_mode_,
memgraph::replication::ReplicationRole::MAIN);
memgraph::replication_coordination_glue::ReplicationRole::MAIN);
durability::CreateSnapshot(this, &transaction, recovery_.snapshot_directory_, recovery_.wal_directory_, &vertices_,
&edges_, uuid_, epoch, repl_storage_state_.history, &file_retainer_);
// Finalize snapshot transaction.
@ -2100,14 +2102,16 @@ utils::FileRetainer::FileLockerAccessor::ret_type InMemoryStorage::UnlockPath()
return true;
}
std::unique_ptr<Storage::Accessor> InMemoryStorage::Access(memgraph::replication::ReplicationRole replication_role,
std::optional<IsolationLevel> override_isolation_level) {
std::unique_ptr<Storage::Accessor> InMemoryStorage::Access(
memgraph::replication_coordination_glue::ReplicationRole replication_role,
std::optional<IsolationLevel> override_isolation_level) {
return std::unique_ptr<InMemoryAccessor>(new InMemoryAccessor{Storage::Accessor::shared_access, this,
override_isolation_level.value_or(isolation_level_),
storage_mode_, replication_role});
}
std::unique_ptr<Storage::Accessor> InMemoryStorage::UniqueAccess(
memgraph::replication::ReplicationRole replication_role, std::optional<IsolationLevel> override_isolation_level) {
memgraph::replication_coordination_glue::ReplicationRole replication_role,
std::optional<IsolationLevel> override_isolation_level) {
return std::unique_ptr<InMemoryAccessor>(new InMemoryAccessor{Storage::Accessor::unique_access, this,
override_isolation_level.value_or(isolation_level_),
storage_mode_, replication_role});

View File

@ -73,7 +73,8 @@ class InMemoryStorage final : public Storage {
friend class InMemoryStorage;
explicit InMemoryAccessor(auto tag, InMemoryStorage *storage, IsolationLevel isolation_level,
StorageMode storage_mode, memgraph::replication::ReplicationRole replication_role);
StorageMode storage_mode,
memgraph::replication_coordination_glue::ReplicationRole replication_role);
public:
InMemoryAccessor(const InMemoryAccessor &) = delete;
@ -322,10 +323,10 @@ class InMemoryStorage final : public Storage {
};
using Storage::Access;
std::unique_ptr<Accessor> Access(memgraph::replication::ReplicationRole replication_role,
std::unique_ptr<Accessor> Access(memgraph::replication_coordination_glue::ReplicationRole replication_role,
std::optional<IsolationLevel> override_isolation_level) override;
using Storage::UniqueAccess;
std::unique_ptr<Accessor> UniqueAccess(memgraph::replication::ReplicationRole replication_role,
std::unique_ptr<Accessor> UniqueAccess(memgraph::replication_coordination_glue::ReplicationRole replication_role,
std::optional<IsolationLevel> override_isolation_level) override;
void FreeMemory(std::unique_lock<utils::ResourceLock> main_guard) override;
@ -335,12 +336,12 @@ class InMemoryStorage final : public Storage {
utils::FileRetainer::FileLockerAccessor::ret_type UnlockPath();
utils::BasicResult<InMemoryStorage::CreateSnapshotError> CreateSnapshot(
memgraph::replication::ReplicationRole replication_role);
memgraph::replication_coordination_glue::ReplicationRole replication_role);
void CreateSnapshotHandler(std::function<utils::BasicResult<InMemoryStorage::CreateSnapshotError>()> cb);
Transaction CreateTransaction(IsolationLevel isolation_level, StorageMode storage_mode,
memgraph::replication::ReplicationRole replication_role) override;
memgraph::replication_coordination_glue::ReplicationRole replication_role) override;
void SetStorageMode(StorageMode storage_mode);
@ -365,7 +366,8 @@ class InMemoryStorage final : public Storage {
void FinalizeWalFile();
StorageInfo GetBaseInfo(bool force_directory) override;
StorageInfo GetInfo(bool force_directory, memgraph::replication::ReplicationRole replication_role) override;
StorageInfo GetInfo(bool force_directory,
memgraph::replication_coordination_glue::ReplicationRole replication_role) override;
/// Return true in all cases excepted if any sync replicas have not sent confirmation.
[[nodiscard]] bool AppendToWal(const Transaction &transaction, uint64_t final_commit_timestamp,

View File

@ -67,6 +67,7 @@ void ReplicationStorageClient::UpdateReplicaState(Storage *storage, DatabaseAcce
"now hold unique data. Please resolve data conflicts and start the "
"replication on a clean instance.",
client_.name_, client_.name_, client_.name_);
// TODO: (andi) Talk about renaming MAYBE_BEHIND to branching
// State not updated, hence in MAYBE_BEHIND state
return;
}

View File

@ -49,7 +49,8 @@ Storage::Storage(Config config, StorageMode storage_mode)
}
Storage::Accessor::Accessor(SharedAccess /* tag */, Storage *storage, IsolationLevel isolation_level,
StorageMode storage_mode, memgraph::replication::ReplicationRole replication_role)
StorageMode storage_mode,
memgraph::replication_coordination_glue::ReplicationRole replication_role)
: storage_(storage),
// The lock must be acquired before creating the transaction object to
// prevent freshly created transactions from dangling in an active state
@ -61,7 +62,8 @@ Storage::Accessor::Accessor(SharedAccess /* tag */, Storage *storage, IsolationL
creation_storage_mode_(storage_mode) {}
Storage::Accessor::Accessor(UniqueAccess /* tag */, Storage *storage, IsolationLevel isolation_level,
StorageMode storage_mode, memgraph::replication::ReplicationRole replication_role)
StorageMode storage_mode,
memgraph::replication_coordination_glue::ReplicationRole replication_role)
: storage_(storage),
// The lock must be acquired before creating the transaction object to
// prevent freshly created transactions from dangling in an active state

View File

@ -145,9 +145,9 @@ class Storage {
} unique_access;
Accessor(SharedAccess /* tag */, Storage *storage, IsolationLevel isolation_level, StorageMode storage_mode,
memgraph::replication::ReplicationRole replication_role);
memgraph::replication_coordination_glue::ReplicationRole replication_role);
Accessor(UniqueAccess /* tag */, Storage *storage, IsolationLevel isolation_level, StorageMode storage_mode,
memgraph::replication::ReplicationRole replication_role);
memgraph::replication_coordination_glue::ReplicationRole replication_role);
Accessor(const Accessor &) = delete;
Accessor &operator=(const Accessor &) = delete;
Accessor &operator=(Accessor &&other) = delete;
@ -328,16 +328,17 @@ class Storage {
void FreeMemory() { FreeMemory({}); }
virtual std::unique_ptr<Accessor> Access(memgraph::replication::ReplicationRole replication_role,
virtual std::unique_ptr<Accessor> Access(memgraph::replication_coordination_glue::ReplicationRole replication_role,
std::optional<IsolationLevel> override_isolation_level) = 0;
std::unique_ptr<Accessor> Access(memgraph::replication::ReplicationRole replication_role) {
std::unique_ptr<Accessor> Access(memgraph::replication_coordination_glue::ReplicationRole replication_role) {
return Access(replication_role, {});
}
virtual std::unique_ptr<Accessor> UniqueAccess(memgraph::replication::ReplicationRole replication_role,
std::optional<IsolationLevel> override_isolation_level) = 0;
std::unique_ptr<Accessor> UniqueAccess(memgraph::replication::ReplicationRole replication_role) {
virtual std::unique_ptr<Accessor> UniqueAccess(
memgraph::replication_coordination_glue::ReplicationRole replication_role,
std::optional<IsolationLevel> override_isolation_level) = 0;
std::unique_ptr<Accessor> UniqueAccess(memgraph::replication_coordination_glue::ReplicationRole replication_role) {
return UniqueAccess(replication_role, {});
}
@ -356,10 +357,11 @@ class Storage {
return GetBaseInfo(force_dir);
}
virtual StorageInfo GetInfo(bool force_directory, memgraph::replication::ReplicationRole replication_role) = 0;
virtual StorageInfo GetInfo(bool force_directory,
memgraph::replication_coordination_glue::ReplicationRole replication_role) = 0;
virtual Transaction CreateTransaction(IsolationLevel isolation_level, StorageMode storage_mode,
memgraph::replication::ReplicationRole replication_role) = 0;
memgraph::replication_coordination_glue::ReplicationRole replication_role) = 0;
virtual void PrepareForNewEpoch() = 0;

View File

@ -32,7 +32,7 @@ namespace memgraph::utils {
* void long_function() {
* resource.enable();
* OnScopeExit on_exit([&resource] { resource.disable(); });
* // long block of code, might trow an exception
* // long block of code, might throw an exception
* }
*/
template <typename Callable>

View File

@ -1,4 +1,4 @@
// Copyright 2023 Memgraph Ltd.
// Copyright 2024 Memgraph Ltd.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
@ -58,28 +58,40 @@ class Scheduler {
// the start of the program. Since Server will log some messages on
// the program start we let him log first and we make sure by first
// waiting that function f will not log before it.
// Check for pause also.
std::unique_lock<std::mutex> lk(mutex_);
auto now = std::chrono::system_clock::now();
start_time += pause;
if (start_time > now) {
condition_variable_.wait_until(lk, start_time, [&] { return is_working_.load() == false; });
condition_variable_.wait_until(lk, start_time, [&] { return !is_working_.load(); });
} else {
start_time = now;
}
pause_cv_.wait(lk, [&] { return !is_paused_.load(); });
if (!is_working_) break;
f();
}
});
}
/**
 * @brief Clears the paused flag and wakes the worker thread waiting on
 * pause_cv_, so that scheduled execution can continue.
 * NOTE(review): the flag is changed and the notification is sent without
 * holding mutex_, while the worker evaluates the flag under mutex_; a
 * notification arriving between the worker's check and its wait could
 * presumably be missed — confirm whether the lock is needed here.
 */
void Resume() {
is_paused_.store(false);
pause_cv_.notify_one();
}
/**
 * @brief Marks the scheduler as paused. The worker thread checks this flag
 * after its timed wait and blocks until the flag is cleared before calling the
 * scheduled function again. Only the flag is set here; nothing is notified.
 */
void Pause() { is_paused_.store(true); }
/**
 * @brief Stops the thread execution. This is a blocking call and may take as
 * much time as one call to the function given previously to Run takes.
 * The paused flag is cleared and both condition variables are notified so that
 * a worker blocked in either wait can observe the stop request; the worker
 * thread is then joined if it is joinable.
 * @throw std::system_error
 */
void Stop() {
// Unpause as well: the worker's pause wait only returns once is_paused_ is false.
is_paused_.store(false);
is_working_.store(false);
pause_cv_.notify_one();
condition_variable_.notify_one();
if (thread_.joinable()) thread_.join();
}
@ -97,6 +109,16 @@ class Scheduler {
*/
std::atomic<bool> is_working_{false};
/**
* Variable is true when thread is paused.
*/
std::atomic<bool> is_paused_{false};
/**
* Condition variable the worker thread waits on while paused; notified on
* resume and on stop.
*/
std::condition_variable pause_cv_;
/**
* Mutex used to synchronize threads using condition variable.
*/

View File

@ -97,6 +97,8 @@ enum class TypeId : uint64_t {
// Coordinator
COORD_FAILOVER_REQ,
COORD_FAILOVER_RES,
COORD_SET_REPL_MAIN_REQ,
COORD_SET_REPL_MAIN_RES,
// AST
AST_LABELIX = 3000,

View File

@ -17,7 +17,7 @@
#include "storage/v2/inmemory/storage.hpp"
#include "storage/v2/storage.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
// The following classes are wrappers for memgraph::utils::MemoryResource, so that we can
// use BENCHMARK_TEMPLATE

View File

@ -33,7 +33,7 @@
#include "query/interpreter.hpp"
#include "storage/v2/inmemory/storage.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
// The following classes are wrappers for memgraph::utils::MemoryResource, so that we can
// use BENCHMARK_TEMPLATE

View File

@ -20,7 +20,7 @@
#include "query/plan/vertex_count_cache.hpp"
#include "storage/v2/inmemory/storage.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
// Add chained MATCH (node1) -- (node2), MATCH (node2) -- (node3) ... clauses.
static memgraph::query::CypherQuery *AddChainedMatches(int num_matches, memgraph::query::AstStorage &storage) {

View File

@ -17,7 +17,7 @@
#include "storage/v2/storage.hpp"
#include "utils/timer.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
// This benchmark should be run for a fixed amount of time that is
// large compared to GC interval to make the output relevant.

View File

@ -17,7 +17,7 @@
#include "storage/v2/storage.hpp"
#include "utils/timer.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
// This benchmark should be run for a fixed amount of time that is
// large compared to GC interval to make the output relevant.

View File

@ -19,7 +19,7 @@
#include "storage/v2/storage_error.hpp"
#include "utils/thread.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
const uint64_t kNumVerifiers = 5;
const uint64_t kNumMutators = 1;

View File

@ -16,7 +16,7 @@
#include "storage/v2/constraints/constraints.hpp"
#include "storage/v2/inmemory/storage.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
const int kNumThreads = 8;

View File

@ -1,10 +1,8 @@
find_package(gflags REQUIRED)
copy_e2e_python_files(ha_experimental coordinator.py)
copy_e2e_python_files(ha_experimental client_initiated_failover.py)
copy_e2e_python_files(ha_experimental uninitialized_cluster.py)
copy_e2e_python_files(ha_experimental automatic_failover.py)
copy_e2e_python_files(ha_experimental common.py)
copy_e2e_python_files(ha_experimental conftest.py)
copy_e2e_python_files(ha_experimental workloads.yaml)
copy_e2e_python_files_from_parent_folder(ha_experimental ".." memgraph.py)

View File

@ -0,0 +1,337 @@
# Copyright 2022 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.
import os
import shutil
import sys
import tempfile
import interactive_mg_runner
import pytest
from common import connect, execute_and_fetch_all, safe_execute
from mg_utils import mg_sleep_and_assert
# Point the shared runner module at this checkout. The project root is four
# directory levels above this script, and the memgraph binary is expected
# directly inside <project root>/build.
interactive_mg_runner.SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
interactive_mg_runner.PROJECT_DIR = os.path.normpath(
    os.path.join(interactive_mg_runner.SCRIPT_DIR, *([os.pardir] * 4))
)
interactive_mg_runner.BUILD_DIR = os.path.normpath(
    os.path.join(interactive_mg_runner.PROJECT_DIR, "build")
)
interactive_mg_runner.MEMGRAPH_BINARY = os.path.normpath(
    os.path.join(interactive_mg_runner.BUILD_DIR, "memgraph")
)
# Root directory under which every instance keeps its data directory.
# mkdtemp() creates the directory and leaves it in place, so the path stays
# reserved for this module. Taking `.name` from an unreferenced
# TemporaryDirectory() would delete the directory as soon as the object is
# finalized, leaving only a stale path that anything else could claim.
TEMP_DIR = tempfile.mkdtemp()
# Launch description passed to interactive_mg_runner: three data instances
# plus one coordinator. The data instances differ only in their index and bolt
# port, so they are generated; each gets coordinator server port 1001<index>,
# its own log file and its own data directory under TEMP_DIR, and no setup
# queries. The coordinator's setup queries register all three instances and
# then set instance_3 to MAIN.
MEMGRAPH_INSTANCES_DESCRIPTION = {
    **{
        f"instance_{index}": {
            "args": [
                "--bolt-port",
                bolt_port,
                "--log-level",
                "TRACE",
                "--coordinator-server-port",
                f"1001{index}",
                "--replication-restore-state-on-startup",
            ],
            "log_file": f"instance_{index}.log",
            "data_directory": f"{TEMP_DIR}/instance_{index}",
            "setup_queries": [],
        }
        for index, bolt_port in ((1, "7688"), (2, "7689"), (3, "7687"))
    },
    "coordinator": {
        "args": ["--bolt-port", "7690", "--log-level=TRACE", "--coordinator"],
        "log_file": "coordinator.log",
        "setup_queries": [
            "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001';",
            "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002';",
            "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003';",
            "SET INSTANCE instance_3 TO MAIN",
        ],
    },
}
def test_show_replication_cluster():
    """SHOW REPLICATION CLUSTER reflects replicas being killed one by one.

    Starts the whole cluster, checks what the coordinator reports and what each
    instance reports as its own role, then kills instance_1 and instance_2 in
    turn and expects each to be reported as not alive with role "unknown".
    """
    safe_execute(shutil.rmtree, TEMP_DIR)
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    cursor_1 = connect(host="localhost", port=7688).cursor()
    cursor_2 = connect(host="localhost", port=7689).cursor()
    cursor_3 = connect(host="localhost", port=7687).cursor()
    coord_cursor = connect(host="localhost", port=7690).cursor()

    def cluster_state():
        return sorted(execute_and_fetch_all(coord_cursor, "SHOW REPLICATION CLUSTER;"))

    def role_reader(cursor):
        # Build a zero-argument callable, which is what mg_sleep_and_assert is given.
        def read_role():
            return sorted(execute_and_fetch_all(cursor, "SHOW REPLICATION ROLE;"))

        return read_role

    all_alive = [
        ("instance_1", "127.0.0.1:10011", True, "replica"),
        ("instance_2", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "127.0.0.1:10013", True, "main"),
    ]
    mg_sleep_and_assert(all_alive, cluster_state)

    # Every instance must agree with the coordinator about its own role.
    mg_sleep_and_assert([("replica",)], role_reader(cursor_1))
    mg_sleep_and_assert([("replica",)], role_reader(cursor_2))
    mg_sleep_and_assert([("main",)], role_reader(cursor_3))

    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_1")
    first_replica_down = [
        ("instance_1", "127.0.0.1:10011", False, "unknown"),
        ("instance_2", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "127.0.0.1:10013", True, "main"),
    ]
    mg_sleep_and_assert(first_replica_down, cluster_state)

    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_2")
    both_replicas_down = [
        ("instance_1", "127.0.0.1:10011", False, "unknown"),
        ("instance_2", "127.0.0.1:10012", False, "unknown"),
        ("instance_3", "127.0.0.1:10013", True, "main"),
    ]
    mg_sleep_and_assert(both_replicas_down, cluster_state)
def test_simple_automatic_failover():
    """Killing the main instance makes the coordinator promote instance_1.

    Checks the replicas listed by the original main (instance_3), kills it, then
    expects the coordinator to report instance_1 as main and instance_1 itself
    to list instance_2 as its only replica.
    """
    safe_execute(shutil.rmtree, TEMP_DIR)
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    # One-shot check (no retry) of the replicas known to the original main.
    main_cursor = connect(host="localhost", port=7687).cursor()
    replicas_of_old_main = [
        ("instance_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
        ("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
    ]
    reported_by_old_main = sorted(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;"))
    assert reported_by_old_main == replicas_of_old_main

    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")

    coord_cursor = connect(host="localhost", port=7690).cursor()

    def cluster_state():
        return sorted(execute_and_fetch_all(coord_cursor, "SHOW REPLICATION CLUSTER;"))

    cluster_after_failover = [
        ("instance_1", "127.0.0.1:10011", True, "main"),
        ("instance_2", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "127.0.0.1:10013", False, "unknown"),
    ]
    mg_sleep_and_assert(cluster_after_failover, cluster_state)

    new_main_cursor = connect(host="localhost", port=7688).cursor()

    def replicas_of_new_main():
        return sorted(execute_and_fetch_all(new_main_cursor, "SHOW REPLICAS;"))

    mg_sleep_and_assert(
        [("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready")],
        replicas_of_new_main,
    )
def test_registering_replica_fails_name_exists():
    """Registering a second instance under an already used name is rejected."""
    safe_execute(shutil.rmtree, TEMP_DIR)
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    coord_cursor = connect(host="localhost", port=7690).cursor()
    # Same name as an instance registered by the setup queries, new endpoints.
    duplicate_name_query = "REGISTER INSTANCE instance_1 ON '127.0.0.1:10051' WITH '127.0.0.1:10111';"
    with pytest.raises(Exception) as raised:
        execute_and_fetch_all(coord_cursor, duplicate_name_query)
    assert str(raised.value) == "Couldn't register replica instance since instance with such name already exists!"

    # NOTE(review): this is the only test in the file that removes TEMP_DIR at
    # the end as well as at the start - confirm it is intentional.
    shutil.rmtree(TEMP_DIR)
def test_registering_replica_fails_endpoint_exists():
    """Registering a new name on an already used coordinator endpoint is rejected."""
    safe_execute(shutil.rmtree, TEMP_DIR)
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    coord_cursor = connect(host="localhost", port=7690).cursor()
    # New name, but the ON endpoint is the one instance_1 was registered with.
    duplicate_endpoint_query = "REGISTER INSTANCE instance_5 ON '127.0.0.1:10011' WITH '127.0.0.1:10005';"
    with pytest.raises(Exception) as raised:
        execute_and_fetch_all(coord_cursor, duplicate_endpoint_query)
    assert str(raised.value) == "Couldn't register replica instance since instance with such endpoint already exists!"
def test_replica_instance_restarts():
    """A killed replica is reported as a live replica again once restarted.

    Kills instance_1, waits until the coordinator reports it as not alive with
    role "unknown", starts it again and expects both the coordinator and the
    instance itself to report the replica role.
    """
    safe_execute(shutil.rmtree, TEMP_DIR)
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    coord_cursor = connect(host="localhost", port=7690).cursor()

    def cluster_state():
        return sorted(execute_and_fetch_all(coord_cursor, "SHOW REPLICATION CLUSTER;"))

    cluster_all_up = [
        ("instance_1", "127.0.0.1:10011", True, "replica"),
        ("instance_2", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "127.0.0.1:10013", True, "main"),
    ]
    cluster_replica_down = [
        ("instance_1", "127.0.0.1:10011", False, "unknown"),
        ("instance_2", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "127.0.0.1:10013", True, "main"),
    ]

    mg_sleep_and_assert(cluster_all_up, cluster_state)

    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_1")
    mg_sleep_and_assert(cluster_replica_down, cluster_state)

    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_1")
    mg_sleep_and_assert(cluster_all_up, cluster_state)

    # The restarted instance must itself report the replica role.
    restarted_cursor = connect(host="localhost", port=7688).cursor()

    def restarted_instance_role():
        return sorted(execute_and_fetch_all(restarted_cursor, "SHOW REPLICATION ROLE;"))

    mg_sleep_and_assert([("replica",)], restarted_instance_role)
def test_automatic_failover_main_back_as_replica():
    """A main that returns after a failover rejoins the cluster as a replica.

    Kills instance_3, waits until the coordinator reports instance_1 as main,
    restarts instance_3 and expects both the coordinator and instance_3 itself
    to report it as a replica.
    """
    safe_execute(shutil.rmtree, TEMP_DIR)
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")

    coord_cursor = connect(host="localhost", port=7690).cursor()

    def cluster_state():
        return sorted(execute_and_fetch_all(coord_cursor, "SHOW REPLICATION CLUSTER;"))

    cluster_after_failover = [
        ("instance_1", "127.0.0.1:10011", True, "main"),
        ("instance_2", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "127.0.0.1:10013", False, "unknown"),
    ]
    mg_sleep_and_assert(cluster_after_failover, cluster_state)

    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")
    cluster_after_old_main_returns = [
        ("instance_1", "127.0.0.1:10011", True, "main"),
        ("instance_2", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "127.0.0.1:10013", True, "replica"),
    ]
    mg_sleep_and_assert(cluster_after_old_main_returns, cluster_state)

    old_main_cursor = connect(host="localhost", port=7687).cursor()

    def old_main_role():
        return sorted(execute_and_fetch_all(old_main_cursor, "SHOW REPLICATION ROLE;"))

    mg_sleep_and_assert([("replica",)], old_main_role)
def test_automatic_failover_main_back_as_main():
    """With every instance down, the old main comes back as main.

    Kills all three data instances, restarts instance_3 first and expects it to
    be reported as main, then restarts instance_1 and instance_2 and expects
    them to be reported as replicas.
    """
    safe_execute(shutil.rmtree, TEMP_DIR)
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    for instance_name in ("instance_1", "instance_2", "instance_3"):
        interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, instance_name)

    coord_cursor = connect(host="localhost", port=7690).cursor()

    def cluster_state():
        return sorted(execute_and_fetch_all(coord_cursor, "SHOW REPLICATION CLUSTER;"))

    def role_reader(cursor):
        # Build a zero-argument callable, which is what mg_sleep_and_assert is given.
        def read_role():
            return sorted(execute_and_fetch_all(cursor, "SHOW REPLICATION ROLE;"))

        return read_role

    cluster_all_down = [
        ("instance_1", "127.0.0.1:10011", False, "unknown"),
        ("instance_2", "127.0.0.1:10012", False, "unknown"),
        ("instance_3", "127.0.0.1:10013", False, "unknown"),
    ]
    mg_sleep_and_assert(cluster_all_down, cluster_state)

    # Only the former main is restarted at first.
    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")
    cluster_main_back = [
        ("instance_1", "127.0.0.1:10011", False, "unknown"),
        ("instance_2", "127.0.0.1:10012", False, "unknown"),
        ("instance_3", "127.0.0.1:10013", True, "main"),
    ]
    mg_sleep_and_assert(cluster_main_back, cluster_state)

    instance3_role = role_reader(connect(host="localhost", port=7687).cursor())
    mg_sleep_and_assert([("main",)], instance3_role)

    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_1")
    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_2")
    cluster_replicas_back = [
        ("instance_1", "127.0.0.1:10011", True, "replica"),
        ("instance_2", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "127.0.0.1:10013", True, "main"),
    ]
    mg_sleep_and_assert(cluster_replicas_back, cluster_state)

    instance1_cursor = connect(host="localhost", port=7688).cursor()
    instance2_cursor = connect(host="localhost", port=7689).cursor()

    mg_sleep_and_assert([("replica",)], role_reader(instance1_cursor))
    mg_sleep_and_assert([("replica",)], role_reader(instance2_cursor))
    # instance_3 must still report main once the replicas have rejoined.
    mg_sleep_and_assert([("main",)], instance3_role)
# When executed directly, run this file through pytest ("-rA" is passed as the
# only option) and use pytest's return value as the process exit status.
if __name__ == "__main__":
    exit_status = pytest.main([__file__, "-rA"])
    sys.exit(exit_status)

View File

@ -1,224 +0,0 @@
# Copyright 2022 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.
import os
import sys
import interactive_mg_runner
import pytest
from common import execute_and_fetch_all
from mg_utils import mg_sleep_and_assert
interactive_mg_runner.SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
interactive_mg_runner.PROJECT_DIR = os.path.normpath(
os.path.join(interactive_mg_runner.SCRIPT_DIR, "..", "..", "..", "..")
)
interactive_mg_runner.BUILD_DIR = os.path.normpath(os.path.join(interactive_mg_runner.PROJECT_DIR, "build"))
interactive_mg_runner.MEMGRAPH_BINARY = os.path.normpath(os.path.join(interactive_mg_runner.BUILD_DIR, "memgraph"))
MEMGRAPH_INSTANCES_DESCRIPTION = {
"instance_1": {
"args": ["--bolt-port", "7688", "--log-level", "TRACE", "--coordinator-server-port", "10011"],
"log_file": "replica1.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"],
},
"instance_2": {
"args": ["--bolt-port", "7689", "--log-level", "TRACE", "--coordinator-server-port", "10012"],
"log_file": "replica2.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10002;"],
},
"instance_3": {
"args": ["--bolt-port", "7687", "--log-level", "TRACE", "--coordinator-server-port", "10013"],
"log_file": "main.log",
"setup_queries": [
"REGISTER REPLICA instance_1 SYNC TO '127.0.0.1:10001'",
"REGISTER REPLICA instance_2 SYNC TO '127.0.0.1:10002'",
],
},
"coordinator": {
"args": ["--bolt-port", "7690", "--log-level=TRACE", "--coordinator"],
"log_file": "replica3.log",
"setup_queries": [
"REGISTER REPLICA instance_1 SYNC TO '127.0.0.1:10001' WITH COORDINATOR SERVER ON '127.0.0.1:10011';",
"REGISTER REPLICA instance_2 SYNC TO '127.0.0.1:10002' WITH COORDINATOR SERVER ON '127.0.0.1:10012';",
"REGISTER MAIN instance_3 WITH COORDINATOR SERVER ON '127.0.0.1:10013';",
],
},
}
def test_show_replication_cluster(connection):
# Goal of this test is to check the SHOW REPLICATION CLUSTER command.
# 1. We start all replicas, main and coordinator manually: we want to be able to kill them ourselves without relying on external tooling to kill processes.
# 2. We check that all replicas and main have the correct state: they should all be alive.
# 3. We kill one replica. It should not appear anymore in the SHOW REPLICATION CLUSTER command.
# 4. We kill main. It should not appear anymore in the SHOW REPLICATION CLUSTER command.
# 1.
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
cursor = connection(7690, "coordinator").cursor()
# 2.
# We leave some time for the coordinator to realise the replicas are down.
def retrieve_data():
return set(execute_and_fetch_all(cursor, "SHOW REPLICATION CLUSTER;"))
expected_data = {
("instance_1", "127.0.0.1:10011", True, "replica"),
("instance_2", "127.0.0.1:10012", True, "replica"),
("instance_3", "127.0.0.1:10013", True, "main"),
}
mg_sleep_and_assert(expected_data, retrieve_data)
# 3.
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_1")
expected_data = {
("instance_2", "127.0.0.1:10012", True, "replica"),
("instance_3", "127.0.0.1:10013", True, "main"),
("instance_1", "127.0.0.1:10011", False, "replica"),
}
mg_sleep_and_assert(expected_data, retrieve_data)
# 4.
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")
expected_data = {
("instance_2", "127.0.0.1:10012", True, "replica"),
("instance_1", "127.0.0.1:10011", False, "replica"),
("instance_3", "127.0.0.1:10013", False, "main"),
}
mg_sleep_and_assert(expected_data, retrieve_data)
def test_simple_client_initiated_failover(connection):
# 1. Start all instances
# 2. Kill main
# 3. Run DO FAILOVER on COORDINATOR
# 4. Assert new config on coordinator by running show replication cluster
# 5. Assert replicas on new main
# 1.
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
# 2.
main_cursor = connection(7687, "instance_3").cursor()
expected_data_on_main = {
("instance_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
}
actual_data_on_main = set(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;"))
assert actual_data_on_main == expected_data_on_main
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")
coord_cursor = connection(7690, "coordinator").cursor()
def retrieve_data_show_repl_cluster():
return set(execute_and_fetch_all(coord_cursor, "SHOW REPLICATION CLUSTER;"))
expected_data_on_coord = {
("instance_1", "127.0.0.1:10011", True, "replica"),
("instance_2", "127.0.0.1:10012", True, "replica"),
("instance_3", "127.0.0.1:10013", False, "main"),
}
mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
# 3.
execute_and_fetch_all(coord_cursor, "DO FAILOVER")
expected_data_on_coord = {
("instance_1", "127.0.0.1:10011", True, "main"),
("instance_2", "127.0.0.1:10012", True, "replica"),
}
mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
# 4.
new_main_cursor = connection(7688, "instance_1").cursor()
def retrieve_data_show_replicas():
return set(execute_and_fetch_all(new_main_cursor, "SHOW REPLICAS;"))
expected_data_on_new_main = {
("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
}
mg_sleep_and_assert(expected_data_on_new_main, retrieve_data_show_replicas)
def test_failover_fails_all_replicas_down(connection):
# 1. Start all instances
# 2. Kill all replicas
# 3. Kill main
# 4. Run DO FAILOVER on COORDINATOR. Assert exception is being thrown due to all replicas being down
# 5. Assert cluster status didn't change
# 1.
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
# 2.
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_1")
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_2")
# 3.
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")
coord_cursor = connection(7690, "coordinator").cursor()
# 4.
with pytest.raises(Exception) as e:
execute_and_fetch_all(coord_cursor, "DO FAILOVER;")
assert str(e.value) == "Failover aborted since all replicas are down!"
# 5.
def retrieve_data():
return set(execute_and_fetch_all(coord_cursor, "SHOW REPLICATION CLUSTER;"))
expected_data_on_coord = {
("instance_1", "127.0.0.1:10011", False, "replica"),
("instance_2", "127.0.0.1:10012", False, "replica"),
("instance_3", "127.0.0.1:10013", False, "main"),
}
mg_sleep_and_assert(expected_data_on_coord, retrieve_data)
def test_failover_fails_main_is_alive(connection):
# 1. Start all instances
# 2. Run DO FAILOVER on COORDINATOR. Assert exception is being thrown due to main is still live.
# 3. Assert cluster status didn't change
# 1.
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
# 2.
coord_cursor = connection(7690, "coordinator").cursor()
def retrieve_data():
return set(execute_and_fetch_all(coord_cursor, "SHOW REPLICATION CLUSTER;"))
expected_data_on_coord = {
("instance_1", "127.0.0.1:10011", True, "replica"),
("instance_2", "127.0.0.1:10012", True, "replica"),
("instance_3", "127.0.0.1:10013", True, "main"),
}
mg_sleep_and_assert(expected_data_on_coord, retrieve_data)
# 4.
with pytest.raises(Exception) as e:
execute_and_fetch_all(coord_cursor, "DO FAILOVER;")
assert str(e.value) == "Failover aborted since main is alive!"
# 5.
mg_sleep_and_assert(expected_data_on_coord, retrieve_data)
if __name__ == "__main__":
sys.exit(pytest.main([__file__, "-rA"]))

View File

@ -23,3 +23,10 @@ def connect(**kwargs) -> mgclient.Connection:
connection = mgclient.connect(**kwargs)
connection.autocommit = True
return connection
def safe_execute(function, *args):
    """Call ``function(*args)`` on a best-effort basis.

    Any ``Exception`` raised by the call is deliberately ignored, so the helper
    can be used for cleanup that is allowed to fail. ``KeyboardInterrupt`` and
    ``SystemExit`` are not ``Exception`` subclasses and therefore propagate;
    a bare ``except:`` would have swallowed them as well.

    Args:
        function: Callable to invoke.
        *args: Positional arguments forwarded to ``function``.

    Returns:
        None. The return value of ``function`` is discarded.
    """
    try:
        function(*args)
    except Exception:
        pass

View File

@ -1,43 +0,0 @@
# Copyright 2022 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.
import pytest
from common import connect, execute_and_fetch_all
# The fixture here is more complex because the connection has to be
# parameterized based on the test parameters (info has to be available on both
# sides).
#
# https://docs.pytest.org/en/latest/example/parametrize.html#indirect-parametrization
# is not an elegant/feasible solution here.
#
# The solution was independently developed and then I stumbled upon the same
# approach here https://stackoverflow.com/a/68286553/4888809 which I think is
# optimal.
@pytest.fixture(scope="function")
def connection():
connection_holder = None
role_holder = None
def inner_connection(port, role):
nonlocal connection_holder, role_holder
connection_holder = connect(host="localhost", port=port)
role_holder = role
return connection_holder
yield inner_connection
# Only main instance can be cleaned up because replicas do NOT accept
# writes.
if role_holder == "main":
cursor = connection_holder.cursor()
execute_and_fetch_all(cursor, "MATCH (n) DETACH DELETE n;")

View File

@ -12,87 +12,76 @@
import sys
import pytest
from common import execute_and_fetch_all
from common import connect, execute_and_fetch_all
from mg_utils import mg_sleep_and_assert
def test_disable_cypher_queries(connection):
cursor = connection(7690, "coordinator").cursor()
def test_disable_cypher_queries():
cursor = connect(host="localhost", port=7690).cursor()
with pytest.raises(Exception) as e:
execute_and_fetch_all(cursor, "CREATE (n:TestNode {prop: 'test'})")
assert str(e.value) == "Coordinator can run only coordinator queries!"
def test_coordinator_cannot_be_replica_role(connection):
cursor = connection(7690, "coordinator").cursor()
cursor = connect(host="localhost", port=7690).cursor()
with pytest.raises(Exception) as e:
execute_and_fetch_all(cursor, "SET REPLICATION ROLE TO REPLICA WITH PORT 10001;")
assert str(e.value) == "Coordinator can run only coordinator queries!"
def test_coordinator_cannot_run_show_repl_role(connection):
cursor = connection(7690, "coordinator").cursor()
def test_coordinator_cannot_run_show_repl_role():
cursor = connect(host="localhost", port=7690).cursor()
with pytest.raises(Exception) as e:
execute_and_fetch_all(cursor, "SHOW REPLICATION ROLE;")
assert str(e.value) == "Coordinator can run only coordinator queries!"
def test_coordinator_show_replication_cluster(connection):
cursor = connection(7690, "coordinator").cursor()
def test_coordinator_show_replication_cluster():
cursor = connect(host="localhost", port=7690).cursor()
def retrieve_data():
return set(execute_and_fetch_all(cursor, "SHOW REPLICATION CLUSTER;"))
return sorted(list(execute_and_fetch_all(cursor, "SHOW REPLICATION CLUSTER;")))
expected_data = {
("main", "127.0.0.1:10013", True, "main"),
("replica_1", "127.0.0.1:10011", True, "replica"),
("replica_2", "127.0.0.1:10012", True, "replica"),
}
expected_data = [
("instance_1", "127.0.0.1:10011", True, "replica"),
("instance_2", "127.0.0.1:10012", True, "replica"),
("instance_3", "127.0.0.1:10013", True, "main"),
]
mg_sleep_and_assert(expected_data, retrieve_data)
def test_coordinator_cannot_call_show_replicas(connection):
cursor = connection(7690, "coordinator").cursor()
cursor = connect(host="localhost", port=7690).cursor()
with pytest.raises(Exception) as e:
execute_and_fetch_all(cursor, "SHOW REPLICAS;")
assert str(e.value) == "Coordinator can run only coordinator queries!"
@pytest.mark.parametrize(
"port, role",
[(7687, "main"), (7688, "replica"), (7689, "replica")],
"port",
[7687, 7688, 7689],
)
def test_main_and_replicas_cannot_call_show_repl_cluster(port, role, connection):
cursor = connection(port, role).cursor()
def test_main_and_replicas_cannot_call_show_repl_cluster(port):
cursor = connect(host="localhost", port=port).cursor()
with pytest.raises(Exception) as e:
execute_and_fetch_all(cursor, "SHOW REPLICATION CLUSTER;")
assert str(e.value) == "Only coordinator can run SHOW REPLICATION CLUSTER."
@pytest.mark.parametrize(
"port, role",
[(7687, "main"), (7688, "replica"), (7689, "replica")],
"port",
[7687, 7688, 7689],
)
def test_main_and_replicas_cannot_register_coord_server(port, role, connection):
cursor = connection(port, role).cursor()
def test_main_and_replicas_cannot_register_coord_server(port):
cursor = connect(host="localhost", port=port).cursor()
with pytest.raises(Exception) as e:
execute_and_fetch_all(
cursor,
"REGISTER REPLICA instance_1 SYNC TO '127.0.0.1:10001' WITH COORDINATOR SERVER ON '127.0.0.1:10011';",
"REGISTER INSTANCE instance_1 ON '127.0.0.1:10001' WITH '127.0.0.1:10011';",
)
assert str(e.value) == "Only coordinator can register coordinator server!"
@pytest.mark.parametrize(
"port, role",
[(7687, "main"), (7688, "replica"), (7689, "replica")],
)
def test_main_and_replicas_cannot_run_do_failover(port, role, connection):
cursor = connection(port, role).cursor()
with pytest.raises(Exception) as e:
execute_and_fetch_all(cursor, "DO FAILOVER;")
assert str(e.value) == "Only coordinator can run DO FAILOVER!"
if __name__ == "__main__":
sys.exit(pytest.main([__file__, "-rA"]))

View File

@ -1,26 +0,0 @@
# Copyright 2022 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.
import sys
import pytest
from common import execute_and_fetch_all
def test_failover_on_non_setup_cluster(connection):
cursor = connection(7690, "coordinator").cursor()
with pytest.raises(Exception) as e:
execute_and_fetch_all(cursor, "DO FAILOVER;")
assert str(e.value) == "Failover aborted since cluster is uninitialized!"
if __name__ == "__main__":
sys.exit(pytest.main([__file__, "-rA"]))

View File

@ -3,49 +3,24 @@ ha_cluster: &ha_cluster
replica_1:
args: ["--bolt-port", "7688", "--log-level=TRACE", "--coordinator-server-port=10011"]
log_file: "replication-e2e-replica1.log"
setup_queries: ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"]
replica_2:
args: ["--bolt-port", "7689", "--log-level=TRACE", "--coordinator-server-port=10012"]
log_file: "replication-e2e-replica2.log"
setup_queries: ["SET REPLICATION ROLE TO REPLICA WITH PORT 10002;"]
main:
args: ["--bolt-port", "7687", "--log-level=TRACE", "--coordinator-server-port=10013"]
log_file: "replication-e2e-main.log"
setup_queries: [
"REGISTER REPLICA replica_1 SYNC TO '127.0.0.1:10001'",
"REGISTER REPLICA replica_2 SYNC TO '127.0.0.1:10002'",
]
coordinator:
args: ["--bolt-port", "7690", "--log-level=TRACE", "--coordinator"]
log_file: "replication-e2e-coordinator.log"
setup_queries: [
"REGISTER MAIN main WITH COORDINATOR SERVER ON '127.0.0.1:10013'",
"REGISTER REPLICA replica_1 SYNC TO '127.0.0.1:10001' WITH COORDINATOR SERVER ON '127.0.0.1:10011'",
"REGISTER REPLICA replica_2 SYNC TO '127.0.0.1:10002' WITH COORDINATOR SERVER ON '127.0.0.1:10012'",
]
noninitialized_cluster: &noninitialized_cluster
cluster:
replica_1:
args: ["--bolt-port", "7688", "--log-level=TRACE", "--coordinator-server-port=10011"]
log_file: "replication-e2e-replica1.log"
setup_queries: ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"]
replica_2:
args: ["--bolt-port", "7689", "--log-level=TRACE", "--coordinator-server-port=10012"]
log_file: "replication-e2e-replica2.log"
setup_queries: ["SET REPLICATION ROLE TO REPLICA WITH PORT 10002;"]
main:
args: ["--bolt-port", "7687", "--log-level=TRACE", "--coordinator-server-port=10013"]
log_file: "replication-e2e-main.log"
setup_queries: [
"REGISTER REPLICA replica_1 SYNC TO '127.0.0.1:10001'",
"REGISTER REPLICA replica_2 SYNC TO '127.0.0.1:10002'",
]
coordinator:
args: ["--bolt-port", "7690", "--log-level=TRACE", "--coordinator"]
log_file: "replication-e2e-coordinator.log"
setup_queries: []
replica_2:
args: ["--bolt-port", "7689", "--log-level=TRACE", "--coordinator-server-port=10012"]
log_file: "replication-e2e-replica2.log"
setup_queries: []
main:
args: ["--bolt-port", "7687", "--log-level=TRACE", "--coordinator-server-port=10013"]
log_file: "replication-e2e-main.log"
setup_queries: []
coordinator:
args: ["--bolt-port", "7690", "--log-level=TRACE", "--coordinator"]
log_file: "replication-e2e-coordinator.log"
setup_queries: [
"REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001';",
"REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002';",
"REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003';",
"SET INSTANCE instance_3 TO MAIN;"
]
workloads:
- name: "Coordinator"
@ -53,11 +28,6 @@ workloads:
args: ["high_availability_experimental/coordinator.py"]
<<: *ha_cluster
- name: "Uninitialized cluster"
- name: "Automatic failover"
binary: "tests/e2e/pytest_runner.sh"
args: ["high_availability_experimental/uninitialized_cluster.py"]
<<: *noninitialized_cluster
- name: "Client initiated failover"
binary: "tests/e2e/pytest_runner.sh"
args: ["high_availability_experimental/client_initiated_failover.py"]
args: ["high_availability_experimental/automatic_failover.py"]

View File

@ -14,7 +14,7 @@
#include <gflags/gflags.h>
#include "storage/v2/inmemory/storage.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
DECLARE_int32(min_log_level);
int main(int argc, char *argv[]) {

View File

@ -23,7 +23,7 @@
#include "storage/v2/inmemory/storage.hpp"
#include "storage/v2/storage.hpp"
#include "storage/v2/vertex_accessor.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
/**
* It is possible to run test with custom seed with:
* RC_PARAMS="seed=1" ./random_graph

View File

@ -22,7 +22,7 @@
#include "storage/v2/disk/storage.hpp"
#include "storage/v2/inmemory/storage.hpp"
#include "storage/v2/view.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
#ifdef MG_ENTERPRISE
template <typename StorageType>
class FineGrainedAuthCheckerFixture : public testing::Test {

View File

@ -43,7 +43,7 @@ class VertexDb : public Database {
}
std::unique_ptr<memgraph::storage::Storage::Accessor> Access() override {
return db_->Access(memgraph::replication::ReplicationRole::MAIN);
return db_->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN);
}
std::unique_ptr<LogicalOperator> MakeBfsOperator(Symbol source_sym, Symbol sink_sym, Symbol edge_sym,

View File

@ -32,7 +32,7 @@ class SingleNodeDb : public Database {
}
std::unique_ptr<memgraph::storage::Storage::Accessor> Access() override {
return db_->Access(memgraph::replication::ReplicationRole::MAIN);
return db_->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN);
}
std::unique_ptr<LogicalOperator> MakeBfsOperator(Symbol source_sym, Symbol sink_sym, Symbol edge_sym,

View File

@ -182,7 +182,7 @@ void TestVertexAndEdgeWithDifferentStorages(std::unique_ptr<memgraph::storage::S
output.clear();
// create vertex
auto dba = db->Access(memgraph::replication::ReplicationRole::MAIN);
auto dba = db->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN);
auto va1 = dba->CreateVertex();
auto va2 = dba->CreateVertex();
auto l1 = dba->NameToLabel("label1");

View File

@ -20,7 +20,7 @@
#include "storage/v2/property_value.hpp"
#include "storage/v2/view.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
class ClearingOldDiskDataTest : public ::testing::Test {
public:

View File

@ -43,7 +43,8 @@ struct CppApiTestFixture : public ::testing::Test {
}
memgraph::query::DbAccessor &CreateDbAccessor(const memgraph::storage::IsolationLevel isolationLevel) {
accessors_.push_back(storage->Access(memgraph::replication::ReplicationRole::MAIN, isolationLevel));
accessors_.push_back(
storage->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN, isolationLevel));
db_accessors_.emplace_back(accessors_.back().get());
return db_accessors_.back();
}

View File

@ -2632,77 +2632,6 @@ TEST_P(CypherMainVisitorTest, TestRegisterReplicationQuery) {
ReplicationQuery::SyncMode::SYNC);
}
#ifdef MG_ENTERPRISE
TEST_P(CypherMainVisitorTest, TestRegisterCoordinatorServer) {
auto &ast_generator = *GetParam();
{
const std::string faulty_query_1 = "REGISTER MAIN COORDINATOR SERVER TO";
ASSERT_THROW(ast_generator.ParseQuery(faulty_query_1), SyntaxException);
}
{
const std::string faulty_query_2 = "REGISTER MAIN COORDINATOR SERVER TO MAIN";
ASSERT_THROW(ast_generator.ParseQuery(faulty_query_2), SyntaxException);
}
{
std::string full_query = "REGISTER MAIN main WITH COORDINATOR SERVER ON '127.0.0.1:10011';";
auto *full_query_parsed = dynamic_cast<CoordinatorQuery *>(ast_generator.ParseQuery(full_query));
ASSERT_TRUE(full_query_parsed);
EXPECT_EQ(full_query_parsed->action_, CoordinatorQuery::Action::REGISTER_MAIN_COORDINATOR_SERVER);
EXPECT_EQ(full_query_parsed->role_, CoordinatorQuery::ReplicationRole::MAIN);
EXPECT_EQ(full_query_parsed->instance_name_, "main");
ast_generator.CheckLiteral(full_query_parsed->coordinator_socket_address_, "127.0.0.1:10011");
ASSERT_EQ(full_query_parsed->socket_address_, nullptr);
}
{
std::string full_query =
R"(REGISTER REPLICA replica_1 SYNC TO "127.0.0.1:10002" WITH COORDINATOR SERVER ON "127.0.0.1:10012")";
auto *full_query_parsed = dynamic_cast<CoordinatorQuery *>(ast_generator.ParseQuery(full_query));
ASSERT_TRUE(full_query_parsed);
EXPECT_EQ(full_query_parsed->action_, CoordinatorQuery::Action::REGISTER_REPLICA_COORDINATOR_SERVER);
EXPECT_EQ(full_query_parsed->role_, CoordinatorQuery::ReplicationRole::REPLICA);
ast_generator.CheckLiteral(full_query_parsed->socket_address_, "127.0.0.1:10002");
ast_generator.CheckLiteral(full_query_parsed->coordinator_socket_address_, "127.0.0.1:10012");
EXPECT_EQ(full_query_parsed->instance_name_, "replica_1");
EXPECT_EQ(full_query_parsed->sync_mode_, CoordinatorQuery::SyncMode::SYNC);
}
{
std::string full_query =
R"(REGISTER REPLICA replica_1 ASYNC TO '127.0.0.1:10002' WITH COORDINATOR SERVER ON '127.0.0.1:10012')";
auto *full_query_parsed = dynamic_cast<CoordinatorQuery *>(ast_generator.ParseQuery(full_query));
ASSERT_TRUE(full_query_parsed);
EXPECT_EQ(full_query_parsed->action_, CoordinatorQuery::Action::REGISTER_REPLICA_COORDINATOR_SERVER);
EXPECT_EQ(full_query_parsed->role_, CoordinatorQuery::ReplicationRole::REPLICA);
ast_generator.CheckLiteral(full_query_parsed->socket_address_, "127.0.0.1:10002");
ast_generator.CheckLiteral(full_query_parsed->coordinator_socket_address_, "127.0.0.1:10012");
EXPECT_EQ(full_query_parsed->instance_name_, "replica_1");
EXPECT_EQ(full_query_parsed->sync_mode_, CoordinatorQuery::SyncMode::ASYNC);
}
}
TEST_P(CypherMainVisitorTest, TestDoFailover) {
auto &ast_generator = *GetParam();
{
std::string invalid_query = "DO FAILO";
ASSERT_THROW(ast_generator.ParseQuery(invalid_query), SyntaxException);
}
{
std::string correct_query = "DO FAILOVER";
auto *correct_query_parsed = dynamic_cast<CoordinatorQuery *>(ast_generator.ParseQuery(correct_query));
ASSERT_TRUE(correct_query_parsed);
EXPECT_EQ(correct_query_parsed->action_, CoordinatorQuery::Action::DO_FAILOVER);
}
}
#endif
TEST_P(CypherMainVisitorTest, TestDeleteReplica) {
auto &ast_generator = *GetParam();

View File

@ -165,8 +165,8 @@ TYPED_TEST(InfoTest, InfoCheck) {
ASSERT_FALSE(unique_acc->Commit().HasError());
}
const auto &info =
db_acc->GetInfo(true, memgraph::replication::ReplicationRole::MAIN); // force to use configured directory
const auto &info = db_acc->GetInfo(
true, memgraph::replication_coordination_glue::ReplicationRole::MAIN); // force to use configured directory
ASSERT_EQ(info.storage_info.vertex_count, 5);
ASSERT_EQ(info.storage_info.edge_count, 2);

View File

@ -43,7 +43,7 @@ class PrintToJsonTest : public ::testing::Test {
PrintToJsonTest()
: config(disk_test_utils::GenerateOnDiskConfig(testSuite)),
db(new StorageType(config)),
dba_storage(db->Access(memgraph::replication::ReplicationRole::MAIN)),
dba_storage(db->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN)),
dba(dba_storage.get()) {}
~PrintToJsonTest() override {

View File

@ -23,7 +23,7 @@
using namespace memgraph::query;
using namespace memgraph::query::plan;
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
using CardParam = CostEstimator<memgraph::query::DbAccessor>::CardParam;
using CostParam = CostEstimator<memgraph::query::DbAccessor>::CostParam;
using MiscParam = CostEstimator<memgraph::query::DbAccessor>::MiscParam;

View File

@ -141,7 +141,7 @@ DatabaseState GetState(memgraph::storage::Storage *db) {
// Capture all vertices
std::map<memgraph::storage::Gid, int64_t> gid_mapping;
std::set<DatabaseState::Vertex> vertices;
auto dba = db->Access(memgraph::replication::ReplicationRole::MAIN);
auto dba = db->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN);
for (const auto &vertex : dba->Vertices(memgraph::storage::View::NEW)) {
std::set<std::string, std::less<>> labels;
auto maybe_labels = vertex.Labels(memgraph::storage::View::NEW);
@ -1105,7 +1105,7 @@ TYPED_TEST(DumpTest, MultiplePartialPulls) {
}
TYPED_TEST(DumpTest, DumpDatabaseWithTriggers) {
auto acc = this->db->storage()->Access(memgraph::replication::ReplicationRole::MAIN);
auto acc = this->db->storage()->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN);
memgraph::query::DbAccessor dba(acc.get());
{
auto trigger_store = this->db.get()->trigger_store();

View File

@ -67,7 +67,7 @@ class ExpressionEvaluatorTest : public ::testing::Test {
ExpressionEvaluatorTest()
: config(disk_test_utils::GenerateOnDiskConfig(testSuite)),
db(new StorageType(config)),
storage_dba(db->Access(memgraph::replication::ReplicationRole::MAIN)),
storage_dba(db->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN)),
dba(storage_dba.get()) {}
~ExpressionEvaluatorTest() override {

View File

@ -39,7 +39,7 @@ class HintProviderSuite : public ::testing::Test {
int symbol_count = 0;
void SetUp() {
storage_dba.emplace(db->Access(memgraph::replication::ReplicationRole::MAIN));
storage_dba.emplace(db->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN));
dba.emplace(storage_dba->get());
}

View File

@ -25,7 +25,7 @@
#include "storage/v2/disk/storage.hpp"
#include "storage/v2/inmemory/storage.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
using namespace memgraph::query;
using namespace memgraph::query::plan;

View File

@ -31,7 +31,7 @@
#include "query_plan_common.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
using namespace memgraph::query;
using namespace memgraph::query::plan;

View File

@ -38,7 +38,7 @@
using namespace memgraph::query;
using namespace memgraph::query::plan;
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
template <typename StorageType>
class QueryPlanTest : public testing::Test {

View File

@ -42,7 +42,7 @@
using namespace memgraph::query;
using namespace memgraph::query::plan;
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
const std::string testSuite = "query_plan_match_filter_return";

View File

@ -37,7 +37,7 @@ class OperatorToStringTest : public ::testing::Test {
OperatorToStringTest()
: config(disk_test_utils::GenerateOnDiskConfig(testSuite)),
db(new StorageType(config)),
dba_storage(db->Access(memgraph::replication::ReplicationRole::MAIN)),
dba_storage(db->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN)),
dba(dba_storage.get()) {}
~OperatorToStringTest() override {

View File

@ -37,7 +37,7 @@ class ReadWriteTypeCheckTest : public ::testing::Test {
memgraph::storage::Config config = disk_test_utils::GenerateOnDiskConfig(testSuite);
std::unique_ptr<memgraph::storage::Storage> db{new StorageType(config)};
std::unique_ptr<memgraph::storage::Storage::Accessor> dba_storage{
db->Access(memgraph::replication::ReplicationRole::MAIN)};
db->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN)};
memgraph::query::DbAccessor dba{dba_storage.get()};
void TearDown() override {

View File

@ -18,7 +18,7 @@
#include "query/plan/operator.hpp"
#include "storage/v2/disk/storage.hpp"
#include "storage/v2/inmemory/storage.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
template <typename StorageType>
class QueryPlan : public testing::Test {

View File

@ -37,7 +37,7 @@ class ExpressionPrettyPrinterTest : public ::testing::Test {
memgraph::storage::Config config = disk_test_utils::GenerateOnDiskConfig(testSuite);
std::unique_ptr<memgraph::storage::Storage> db{new StorageType(config)};
std::unique_ptr<memgraph::storage::Storage::Accessor> storage_dba{
db->Access(memgraph::replication::ReplicationRole::MAIN)};
db->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN)};
memgraph::query::DbAccessor dba{storage_dba.get()};
AstStorage storage;

View File

@ -23,7 +23,7 @@
#include "disk_test_utils.hpp"
#include "test_utils.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
template <typename StorageType>
class CypherType : public testing::Test {

View File

@ -21,7 +21,7 @@
#include "storage/v2/inmemory/storage.hpp"
#include "test_utils.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
template <typename StorageType>
class PyModule : public testing::Test {

View File

@ -34,7 +34,7 @@
#include "utils/memory.hpp"
#include "utils/variant_helpers.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
#define EXPECT_SUCCESS(...) EXPECT_EQ(__VA_ARGS__, mgp_error::MGP_ERROR_NO_ERROR)

View File

@ -35,7 +35,7 @@ class TestSymbolGenerator : public ::testing::Test {
memgraph::storage::Config config = disk_test_utils::GenerateOnDiskConfig(testSuite);
std::unique_ptr<memgraph::storage::Storage> db{new StorageType(config)};
std::unique_ptr<memgraph::storage::Storage::Accessor> storage_dba{
db->Access(memgraph::replication::ReplicationRole::MAIN)};
db->Access(memgraph::replication_coordination_glue::ReplicationRole::MAIN)};
memgraph::query::DbAccessor dba{storage_dba.get()};
AstStorage storage;

View File

@ -29,7 +29,7 @@
#include "utils/exceptions.hpp"
#include "utils/memory.hpp"
using memgraph::replication::ReplicationRole;
using memgraph::replication_coordination_glue::ReplicationRole;
namespace {
const std::unordered_set<memgraph::query::TriggerEventType> kAllEventTypes{

Some files were not shown because too many files have changed in this diff Show More