Merge branch 'master' into text-search-integration-poc
This commit is contained in:
commit
d28f7ddea8
1
.github/ISSUE_TEMPLATE/bug_report.md
vendored
1
.github/ISSUE_TEMPLATE/bug_report.md
vendored
@ -3,7 +3,6 @@ name: Bug report
|
||||
about: Create a report to help us improve
|
||||
title: ""
|
||||
labels: bug
|
||||
assignees: gitbuda
|
||||
---
|
||||
|
||||
**Memgraph version**
|
||||
|
65
.github/workflows/diff.yaml
vendored
65
.github/workflows/diff.yaml
vendored
@ -383,71 +383,6 @@ jobs:
|
||||
# multiple paths could be defined
|
||||
build/logs
|
||||
|
||||
experimental_build_mt:
|
||||
name: "MultiTenancy replication build"
|
||||
runs-on: [self-hosted, Linux, X64, Diff]
|
||||
env:
|
||||
THREADS: 24
|
||||
MEMGRAPH_ENTERPRISE_LICENSE: ${{ secrets.MEMGRAPH_ENTERPRISE_LICENSE }}
|
||||
MEMGRAPH_ORGANIZATION_NAME: ${{ secrets.MEMGRAPH_ORGANIZATION_NAME }}
|
||||
|
||||
steps:
|
||||
- name: Set up repository
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
# Number of commits to fetch. `0` indicates all history for all
|
||||
# branches and tags. (default: 1)
|
||||
fetch-depth: 0
|
||||
|
||||
|
||||
- name: Build release binaries
|
||||
run: |
|
||||
# Activate toolchain.
|
||||
source /opt/toolchain-v4/activate
|
||||
|
||||
# Initialize dependencies.
|
||||
./init
|
||||
|
||||
# Build MT replication experimental binaries.
|
||||
cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -D MG_EXPERIMENTAL_REPLICATION_MULTITENANCY=ON ..
|
||||
make -j$THREADS
|
||||
|
||||
- name: Run unit tests
|
||||
run: |
|
||||
# Activate toolchain.
|
||||
source /opt/toolchain-v4/activate
|
||||
|
||||
# Run unit tests.
|
||||
cd build
|
||||
ctest -R memgraph__unit --output-on-failure -j$THREADS
|
||||
|
||||
- name: Run e2e tests
|
||||
if: false
|
||||
run: |
|
||||
cd tests
|
||||
./setup.sh /opt/toolchain-v4/activate
|
||||
source ve3/bin/activate_e2e
|
||||
cd e2e
|
||||
|
||||
# Just the replication based e2e tests
|
||||
./run.sh "Replicate multitenancy"
|
||||
./run.sh "Show"
|
||||
./run.sh "Show while creating invalid state"
|
||||
./run.sh "Delete edge replication"
|
||||
./run.sh "Read-write benchmark"
|
||||
./run.sh "Index replication"
|
||||
./run.sh "Constraints"
|
||||
|
||||
- name: Save test data
|
||||
uses: actions/upload-artifact@v4
|
||||
if: always()
|
||||
with:
|
||||
name: "Test data(MultiTenancy replication build)"
|
||||
path: |
|
||||
# multiple paths could be defined
|
||||
build/logs
|
||||
|
||||
release_jepsen_test:
|
||||
name: "Release Jepsen Test"
|
||||
runs-on: [self-hosted, Linux, X64, Debian10, JepsenControl]
|
||||
|
11
.github/workflows/release_debian10.yaml
vendored
11
.github/workflows/release_debian10.yaml
vendored
@ -1,4 +1,7 @@
|
||||
name: Release Debian 10
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}
|
||||
cancel-in-progress: true
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@ -10,7 +13,12 @@ on:
|
||||
options:
|
||||
- Release
|
||||
- RelWithDebInfo
|
||||
|
||||
push:
|
||||
branches:
|
||||
- "release/**"
|
||||
tags:
|
||||
- "v*.*.*-rc*"
|
||||
- "v*.*-rc*"
|
||||
schedule:
|
||||
- cron: "0 22 * * *"
|
||||
|
||||
@ -321,7 +329,6 @@ jobs:
|
||||
--no-strict
|
||||
|
||||
release_e2e_test:
|
||||
if: false
|
||||
name: "Release End-to-end Test"
|
||||
runs-on: [self-hosted, Linux, X64, Debian10]
|
||||
timeout-minutes: 60
|
||||
|
11
.github/workflows/release_ubuntu2004.yaml
vendored
11
.github/workflows/release_ubuntu2004.yaml
vendored
@ -1,4 +1,7 @@
|
||||
name: Release Ubuntu 20.04
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}
|
||||
cancel-in-progress: true
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@ -10,7 +13,12 @@ on:
|
||||
options:
|
||||
- Release
|
||||
- RelWithDebInfo
|
||||
|
||||
push:
|
||||
branches:
|
||||
- "release/**"
|
||||
tags:
|
||||
- "v*.*.*-rc*"
|
||||
- "v*.*-rc*"
|
||||
schedule:
|
||||
- cron: "0 22 * * *"
|
||||
|
||||
@ -317,7 +325,6 @@ jobs:
|
||||
--no-strict
|
||||
|
||||
release_e2e_test:
|
||||
if: false
|
||||
name: "Release End-to-end Test"
|
||||
runs-on: [self-hosted, Linux, X64, Ubuntu20.04]
|
||||
timeout-minutes: 60
|
||||
|
8
.github/workflows/stress_test_large.yaml
vendored
8
.github/workflows/stress_test_large.yaml
vendored
@ -1,4 +1,7 @@
|
||||
name: Stress test large
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}
|
||||
cancel-in-progress: true
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
@ -10,7 +13,10 @@ on:
|
||||
options:
|
||||
- Release
|
||||
- RelWithDebInfo
|
||||
|
||||
push:
|
||||
tags:
|
||||
- "v*.*.*-rc*"
|
||||
- "v*.*-rc*"
|
||||
schedule:
|
||||
- cron: "0 22 * * *"
|
||||
|
||||
|
@ -291,16 +291,6 @@ option(TSAN "Build with Thread Sanitizer. To get a reasonable performance option
|
||||
option(UBSAN "Build with Undefined Behaviour Sanitizer" OFF)
|
||||
|
||||
# Build feature flags
|
||||
option(MG_EXPERIMENTAL_REPLICATION_MULTITENANCY "Feature flag for experimental replicaition of multitenacy" OFF)
|
||||
|
||||
if (NOT MG_ENTERPRISE AND MG_EXPERIMENTAL_REPLICATION_MULTITENANCY)
|
||||
set(MG_EXPERIMENTAL_REPLICATION_MULTITENANCY OFF)
|
||||
message(FATAL_ERROR "MG_EXPERIMENTAL_REPLICATION_MULTITENANCY with community edition build isn't possible")
|
||||
endif ()
|
||||
|
||||
if (MG_EXPERIMENTAL_REPLICATION_MULTITENANCY)
|
||||
add_compile_definitions(MG_EXPERIMENTAL_REPLICATION_MULTITENANCY)
|
||||
endif ()
|
||||
|
||||
if (TEST_COVERAGE)
|
||||
string(TOLOWER ${CMAKE_BUILD_TYPE} lower_build_type)
|
||||
|
@ -9,13 +9,13 @@ target_sources(mg-coordination
|
||||
include/coordination/coordinator_config.hpp
|
||||
include/coordination/coordinator_exceptions.hpp
|
||||
include/coordination/coordinator_slk.hpp
|
||||
include/coordination/coordinator_data.hpp
|
||||
include/coordination/constants.hpp
|
||||
include/coordination/coordinator_cluster_config.hpp
|
||||
include/coordination/coordinator_handlers.hpp
|
||||
include/coordination/coordinator_instance.hpp
|
||||
include/coordination/coordinator_handlers.hpp
|
||||
include/coordination/constants.hpp
|
||||
include/coordination/instance_status.hpp
|
||||
include/coordination/replication_instance.hpp
|
||||
include/coordination/raft_state.hpp
|
||||
include/coordination/rpc_errors.hpp
|
||||
|
||||
include/nuraft/coordinator_log_store.hpp
|
||||
include/nuraft/coordinator_state_machine.hpp
|
||||
@ -26,10 +26,10 @@ target_sources(mg-coordination
|
||||
coordinator_state.cpp
|
||||
coordinator_rpc.cpp
|
||||
coordinator_server.cpp
|
||||
coordinator_data.cpp
|
||||
coordinator_instance.cpp
|
||||
coordinator_handlers.cpp
|
||||
coordinator_instance.cpp
|
||||
replication_instance.cpp
|
||||
raft_state.cpp
|
||||
|
||||
coordinator_log_store.cpp
|
||||
coordinator_state_machine.cpp
|
||||
|
@ -17,6 +17,7 @@
|
||||
#include "coordination/coordinator_config.hpp"
|
||||
#include "coordination/coordinator_rpc.hpp"
|
||||
#include "replication_coordination_glue/messages.hpp"
|
||||
#include "utils/result.hpp"
|
||||
|
||||
namespace memgraph::coordination {
|
||||
|
||||
@ -28,25 +29,38 @@ auto CreateClientContext(memgraph::coordination::CoordinatorClientConfig const &
|
||||
}
|
||||
} // namespace
|
||||
|
||||
CoordinatorClient::CoordinatorClient(CoordinatorData *coord_data, CoordinatorClientConfig config,
|
||||
CoordinatorClient::CoordinatorClient(CoordinatorInstance *coord_instance, CoordinatorClientConfig config,
|
||||
HealthCheckCallback succ_cb, HealthCheckCallback fail_cb)
|
||||
: rpc_context_{CreateClientContext(config)},
|
||||
rpc_client_{io::network::Endpoint(io::network::Endpoint::needs_resolving, config.ip_address, config.port),
|
||||
&rpc_context_},
|
||||
config_{std::move(config)},
|
||||
coord_data_{coord_data},
|
||||
coord_instance_{coord_instance},
|
||||
succ_cb_{std::move(succ_cb)},
|
||||
fail_cb_{std::move(fail_cb)} {}
|
||||
|
||||
auto CoordinatorClient::InstanceName() const -> std::string { return config_.instance_name; }
|
||||
auto CoordinatorClient::SocketAddress() const -> std::string { return rpc_client_.Endpoint().SocketAddress(); }
|
||||
|
||||
auto CoordinatorClient::InstanceDownTimeoutSec() const -> std::chrono::seconds {
|
||||
return config_.instance_down_timeout_sec;
|
||||
}
|
||||
|
||||
auto CoordinatorClient::InstanceGetUUIDFrequencySec() const -> std::chrono::seconds {
|
||||
return config_.instance_get_uuid_frequency_sec;
|
||||
}
|
||||
|
||||
void CoordinatorClient::StartFrequentCheck() {
|
||||
MG_ASSERT(config_.health_check_frequency_sec > std::chrono::seconds(0),
|
||||
if (instance_checker_.IsRunning()) {
|
||||
return;
|
||||
}
|
||||
|
||||
MG_ASSERT(config_.instance_health_check_frequency_sec > std::chrono::seconds(0),
|
||||
"Health check frequency must be greater than 0");
|
||||
|
||||
instance_checker_.Run(
|
||||
config_.instance_name, config_.health_check_frequency_sec, [this, instance_name = config_.instance_name] {
|
||||
config_.instance_name, config_.instance_health_check_frequency_sec,
|
||||
[this, instance_name = config_.instance_name] {
|
||||
try {
|
||||
spdlog::trace("Sending frequent heartbeat to machine {} on {}", instance_name,
|
||||
rpc_client_.Endpoint().SocketAddress());
|
||||
@ -54,9 +68,9 @@ void CoordinatorClient::StartFrequentCheck() {
|
||||
auto stream{rpc_client_.Stream<memgraph::replication_coordination_glue::FrequentHeartbeatRpc>()};
|
||||
stream.AwaitResponse();
|
||||
}
|
||||
succ_cb_(coord_data_, instance_name);
|
||||
succ_cb_(coord_instance_, instance_name);
|
||||
} catch (rpc::RpcFailedException const &) {
|
||||
fail_cb_(coord_data_, instance_name);
|
||||
fail_cb_(coord_instance_, instance_name);
|
||||
}
|
||||
});
|
||||
}
|
||||
@ -117,5 +131,45 @@ auto CoordinatorClient::SendSwapMainUUIDRpc(const utils::UUID &uuid) const -> bo
|
||||
return false;
|
||||
}
|
||||
|
||||
auto CoordinatorClient::SendUnregisterReplicaRpc(std::string const &instance_name) const -> bool {
|
||||
try {
|
||||
auto stream{rpc_client_.Stream<UnregisterReplicaRpc>(instance_name)};
|
||||
if (!stream.AwaitResponse().success) {
|
||||
spdlog::error("Failed to receive successful RPC response for unregistering replica!");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
} catch (rpc::RpcFailedException const &) {
|
||||
spdlog::error("Failed to unregister replica!");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
auto CoordinatorClient::SendGetInstanceUUIDRpc() const
|
||||
-> utils::BasicResult<GetInstanceUUIDError, std::optional<utils::UUID>> {
|
||||
try {
|
||||
auto stream{rpc_client_.Stream<GetInstanceUUIDRpc>()};
|
||||
auto res = stream.AwaitResponse();
|
||||
return res.uuid;
|
||||
} catch (const rpc::RpcFailedException &) {
|
||||
spdlog::error("RPC error occured while sending GetInstance UUID RPC");
|
||||
return GetInstanceUUIDError::RPC_EXCEPTION;
|
||||
}
|
||||
}
|
||||
|
||||
auto CoordinatorClient::SendEnableWritingOnMainRpc() const -> bool {
|
||||
try {
|
||||
auto stream{rpc_client_.Stream<EnableWritingOnMainRpc>()};
|
||||
if (!stream.AwaitResponse().success) {
|
||||
spdlog::error("Failed to receive successful RPC response for enabling writing on main!");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
} catch (rpc::RpcFailedException const &) {
|
||||
spdlog::error("Failed to enable writing on main!");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
#endif
|
||||
|
@ -1,282 +0,0 @@
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
// License, and you may not use this file except in compliance with the Business Source License.
|
||||
//
|
||||
// As of the Change Date specified in that file, in accordance with
|
||||
// the Business Source License, use of this software will be governed
|
||||
// by the Apache License, Version 2.0, included in the file
|
||||
// licenses/APL.txt.
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
|
||||
#include "coordination/coordinator_data.hpp"
|
||||
|
||||
#include "coordination/register_main_replica_coordinator_status.hpp"
|
||||
#include "coordination/replication_instance.hpp"
|
||||
#include "utils/uuid.hpp"
|
||||
|
||||
#include <range/v3/view.hpp>
|
||||
#include <shared_mutex>
|
||||
|
||||
namespace memgraph::coordination {
|
||||
|
||||
using nuraft::ptr;
|
||||
using nuraft::srv_config;
|
||||
|
||||
CoordinatorData::CoordinatorData() {
|
||||
auto find_instance = [](CoordinatorData *coord_data, std::string_view instance_name) -> ReplicationInstance & {
|
||||
auto instance = std::ranges::find_if(
|
||||
coord_data->repl_instances_,
|
||||
[instance_name](ReplicationInstance const &instance) { return instance.InstanceName() == instance_name; });
|
||||
|
||||
MG_ASSERT(instance != coord_data->repl_instances_.end(), "Instance {} not found during callback!", instance_name);
|
||||
return *instance;
|
||||
};
|
||||
|
||||
replica_succ_cb_ = [this, find_instance](CoordinatorData *coord_data, std::string_view instance_name) -> void {
|
||||
auto lock = std::lock_guard{coord_data->coord_data_lock_};
|
||||
spdlog::trace("Instance {} performing replica successful callback", instance_name);
|
||||
auto &instance = find_instance(coord_data, instance_name);
|
||||
|
||||
if (!instance.GetMainUUID().has_value() || main_uuid_ != instance.GetMainUUID().value()) {
|
||||
if (!instance.SendSwapAndUpdateUUID(main_uuid_)) {
|
||||
spdlog::error(
|
||||
fmt::format("Failed to swap uuid for replica instance {} which is alive", instance.InstanceName()));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
instance.OnSuccessPing();
|
||||
};
|
||||
|
||||
replica_fail_cb_ = [find_instance](CoordinatorData *coord_data, std::string_view instance_name) -> void {
|
||||
auto lock = std::lock_guard{coord_data->coord_data_lock_};
|
||||
spdlog::trace("Instance {} performing replica failure callback", instance_name);
|
||||
auto &instance = find_instance(coord_data, instance_name);
|
||||
instance.OnFailPing();
|
||||
// We need to restart main uuid from instance since it was "down" at least a second
|
||||
// There is slight delay, if we choose to use isAlive, instance can be down and back up in less than
|
||||
// our isAlive time difference, which would lead to instance setting UUID to nullopt and stopping accepting any
|
||||
// incoming RPCs from valid main
|
||||
// TODO(antoniofilipovic) this needs here more complex logic
|
||||
// We need to get id of main replica is listening to on successful ping
|
||||
// and swap it to correct uuid if it failed
|
||||
instance.SetNewMainUUID();
|
||||
};
|
||||
|
||||
main_succ_cb_ = [this, find_instance](CoordinatorData *coord_data, std::string_view instance_name) -> void {
|
||||
auto lock = std::lock_guard{coord_data->coord_data_lock_};
|
||||
spdlog::trace("Instance {} performing main successful callback", instance_name);
|
||||
|
||||
auto &instance = find_instance(coord_data, instance_name);
|
||||
|
||||
if (instance.IsAlive()) {
|
||||
instance.OnSuccessPing();
|
||||
return;
|
||||
}
|
||||
|
||||
const auto &instance_uuid = instance.GetMainUUID();
|
||||
MG_ASSERT(instance_uuid.has_value(), "Instance must have uuid set");
|
||||
if (main_uuid_ == instance_uuid.value()) {
|
||||
instance.OnSuccessPing();
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO(antoniof) make demoteToReplica idempotent since main can be demoted to replica but
|
||||
// swapUUID can fail
|
||||
bool const demoted = instance.DemoteToReplica(coord_data->replica_succ_cb_, coord_data->replica_fail_cb_);
|
||||
if (demoted) {
|
||||
instance.OnSuccessPing();
|
||||
spdlog::info("Instance {} demoted to replica", instance_name);
|
||||
} else {
|
||||
spdlog::error("Instance {} failed to become replica", instance_name);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!instance.SendSwapAndUpdateUUID(main_uuid_)) {
|
||||
spdlog::error(fmt::format("Failed to swap uuid for demoted main instance {}", instance.InstanceName()));
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
main_fail_cb_ = [this, find_instance](CoordinatorData *coord_data, std::string_view instance_name) -> void {
|
||||
auto lock = std::lock_guard{coord_data->coord_data_lock_};
|
||||
spdlog::trace("Instance {} performing main failure callback", instance_name);
|
||||
auto &instance = find_instance(coord_data, instance_name);
|
||||
instance.OnFailPing();
|
||||
const auto &instance_uuid = instance.GetMainUUID();
|
||||
MG_ASSERT(instance_uuid.has_value(), "Instance must have uuid set");
|
||||
|
||||
if (!instance.IsAlive() && main_uuid_ == instance_uuid.value()) {
|
||||
spdlog::info("Cluster without main instance, trying automatic failover");
|
||||
coord_data->TryFailover();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
auto CoordinatorData::TryFailover() -> void {
|
||||
auto alive_replicas = repl_instances_ | ranges::views::filter(&ReplicationInstance::IsReplica) |
|
||||
ranges::views::filter(&ReplicationInstance::IsAlive);
|
||||
|
||||
if (ranges::empty(alive_replicas)) {
|
||||
spdlog::warn("Failover failed since all replicas are down!");
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: Smarter choice
|
||||
auto chosen_replica_instance = ranges::begin(alive_replicas);
|
||||
|
||||
chosen_replica_instance->PauseFrequentCheck();
|
||||
utils::OnScopeExit scope_exit{[&chosen_replica_instance] { chosen_replica_instance->ResumeFrequentCheck(); }};
|
||||
|
||||
auto const potential_new_main_uuid = utils::UUID{};
|
||||
|
||||
auto const is_not_chosen_replica_instance = [&chosen_replica_instance](ReplicationInstance &instance) {
|
||||
return instance != *chosen_replica_instance;
|
||||
};
|
||||
|
||||
// If for some replicas swap fails, for others on successful ping we will revert back on next change
|
||||
// or we will do failover first again and then it will be consistent again
|
||||
for (auto &other_replica_instance : alive_replicas | ranges::views::filter(is_not_chosen_replica_instance)) {
|
||||
if (!other_replica_instance.SendSwapAndUpdateUUID(potential_new_main_uuid)) {
|
||||
spdlog::error(fmt::format("Failed to swap uuid for instance {} which is alive, aborting failover",
|
||||
other_replica_instance.InstanceName()));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<ReplClientInfo> repl_clients_info;
|
||||
repl_clients_info.reserve(repl_instances_.size() - 1);
|
||||
std::ranges::transform(repl_instances_ | ranges::views::filter(is_not_chosen_replica_instance),
|
||||
std::back_inserter(repl_clients_info), &ReplicationInstance::ReplicationClientInfo);
|
||||
|
||||
if (!chosen_replica_instance->PromoteToMain(potential_new_main_uuid, std::move(repl_clients_info), main_succ_cb_,
|
||||
main_fail_cb_)) {
|
||||
spdlog::warn("Failover failed since promoting replica to main failed!");
|
||||
return;
|
||||
}
|
||||
chosen_replica_instance->SetNewMainUUID(potential_new_main_uuid);
|
||||
main_uuid_ = potential_new_main_uuid;
|
||||
|
||||
spdlog::info("Failover successful! Instance {} promoted to main.", chosen_replica_instance->InstanceName());
|
||||
}
|
||||
|
||||
auto CoordinatorData::ShowInstances() const -> std::vector<InstanceStatus> {
|
||||
auto const coord_instances = self_.GetAllCoordinators();
|
||||
|
||||
std::vector<InstanceStatus> instances_status;
|
||||
instances_status.reserve(repl_instances_.size() + coord_instances.size());
|
||||
|
||||
auto const stringify_repl_role = [](ReplicationInstance const &instance) -> std::string {
|
||||
if (!instance.IsAlive()) return "unknown";
|
||||
if (instance.IsMain()) return "main";
|
||||
return "replica";
|
||||
};
|
||||
|
||||
auto const repl_instance_to_status = [&stringify_repl_role](ReplicationInstance const &instance) -> InstanceStatus {
|
||||
return {.instance_name = instance.InstanceName(),
|
||||
.coord_socket_address = instance.SocketAddress(),
|
||||
.cluster_role = stringify_repl_role(instance),
|
||||
.is_alive = instance.IsAlive()};
|
||||
};
|
||||
|
||||
auto const coord_instance_to_status = [](ptr<srv_config> const &instance) -> InstanceStatus {
|
||||
return {.instance_name = "coordinator_" + std::to_string(instance->get_id()),
|
||||
.raft_socket_address = instance->get_endpoint(),
|
||||
.cluster_role = "coordinator",
|
||||
.is_alive = true}; // TODO: (andi) Get this info from RAFT and test it or when we will move
|
||||
// CoordinatorState to every instance, we can be smarter about this using our RPC.
|
||||
};
|
||||
|
||||
std::ranges::transform(coord_instances, std::back_inserter(instances_status), coord_instance_to_status);
|
||||
|
||||
{
|
||||
auto lock = std::shared_lock{coord_data_lock_};
|
||||
std::ranges::transform(repl_instances_, std::back_inserter(instances_status), repl_instance_to_status);
|
||||
}
|
||||
|
||||
return instances_status;
|
||||
}
|
||||
|
||||
// TODO: (andi) Make sure you cannot put coordinator instance to the main
|
||||
auto CoordinatorData::SetInstanceToMain(std::string instance_name) -> SetInstanceToMainCoordinatorStatus {
|
||||
auto lock = std::lock_guard{coord_data_lock_};
|
||||
|
||||
auto const is_new_main = [&instance_name](ReplicationInstance const &instance) {
|
||||
return instance.InstanceName() == instance_name;
|
||||
};
|
||||
auto new_main = std::ranges::find_if(repl_instances_, is_new_main);
|
||||
|
||||
if (new_main == repl_instances_.end()) {
|
||||
spdlog::error("Instance {} not registered. Please register it using REGISTER INSTANCE {}", instance_name,
|
||||
instance_name);
|
||||
return SetInstanceToMainCoordinatorStatus::NO_INSTANCE_WITH_NAME;
|
||||
}
|
||||
|
||||
new_main->PauseFrequentCheck();
|
||||
utils::OnScopeExit scope_exit{[&new_main] { new_main->ResumeFrequentCheck(); }};
|
||||
|
||||
ReplicationClientsInfo repl_clients_info;
|
||||
repl_clients_info.reserve(repl_instances_.size() - 1);
|
||||
|
||||
auto const is_not_new_main = [&instance_name](ReplicationInstance const &instance) {
|
||||
return instance.InstanceName() != instance_name;
|
||||
};
|
||||
|
||||
auto potential_new_main_uuid = utils::UUID{};
|
||||
spdlog::trace("Generated potential new main uuid");
|
||||
|
||||
for (auto &other_instance : repl_instances_ | ranges::views::filter(is_not_new_main)) {
|
||||
if (!other_instance.SendSwapAndUpdateUUID(potential_new_main_uuid)) {
|
||||
spdlog::error(
|
||||
fmt::format("Failed to swap uuid for instance {}, aborting failover", other_instance.InstanceName()));
|
||||
return SetInstanceToMainCoordinatorStatus::SWAP_UUID_FAILED;
|
||||
}
|
||||
}
|
||||
|
||||
std::ranges::transform(repl_instances_ | ranges::views::filter(is_not_new_main),
|
||||
std::back_inserter(repl_clients_info),
|
||||
[](const ReplicationInstance &instance) { return instance.ReplicationClientInfo(); });
|
||||
|
||||
if (!new_main->PromoteToMain(potential_new_main_uuid, std::move(repl_clients_info), main_succ_cb_, main_fail_cb_)) {
|
||||
return SetInstanceToMainCoordinatorStatus::COULD_NOT_PROMOTE_TO_MAIN;
|
||||
}
|
||||
|
||||
new_main->SetNewMainUUID(potential_new_main_uuid);
|
||||
main_uuid_ = potential_new_main_uuid;
|
||||
spdlog::info("Instance {} promoted to main", instance_name);
|
||||
return SetInstanceToMainCoordinatorStatus::SUCCESS;
|
||||
}
|
||||
|
||||
auto CoordinatorData::RegisterInstance(CoordinatorClientConfig config) -> RegisterInstanceCoordinatorStatus {
|
||||
auto lock = std::lock_guard{coord_data_lock_};
|
||||
if (std::ranges::any_of(repl_instances_, [&config](ReplicationInstance const &instance) {
|
||||
return instance.InstanceName() == config.instance_name;
|
||||
})) {
|
||||
return RegisterInstanceCoordinatorStatus::NAME_EXISTS;
|
||||
}
|
||||
|
||||
if (std::ranges::any_of(repl_instances_, [&config](ReplicationInstance const &instance) {
|
||||
return instance.SocketAddress() == config.SocketAddress();
|
||||
})) {
|
||||
return RegisterInstanceCoordinatorStatus::ENDPOINT_EXISTS;
|
||||
}
|
||||
|
||||
try {
|
||||
repl_instances_.emplace_back(this, std::move(config), replica_succ_cb_, replica_fail_cb_);
|
||||
return RegisterInstanceCoordinatorStatus::SUCCESS;
|
||||
|
||||
} catch (CoordinatorRegisterInstanceException const &) {
|
||||
return RegisterInstanceCoordinatorStatus::RPC_FAILED;
|
||||
}
|
||||
}
|
||||
|
||||
auto CoordinatorData::AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_port, std::string raft_address)
|
||||
-> void {
|
||||
self_.AddCoordinatorInstance(raft_server_id, raft_port, std::move(raft_address));
|
||||
}
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
#endif
|
@ -39,6 +39,24 @@ void CoordinatorHandlers::Register(memgraph::coordination::CoordinatorServer &se
|
||||
spdlog::info("Received SwapMainUUIDRPC on coordinator server");
|
||||
CoordinatorHandlers::SwapMainUUIDHandler(replication_handler, req_reader, res_builder);
|
||||
});
|
||||
|
||||
server.Register<coordination::UnregisterReplicaRpc>(
|
||||
[&replication_handler](slk::Reader *req_reader, slk::Builder *res_builder) -> void {
|
||||
spdlog::info("Received UnregisterReplicaRpc on coordinator server");
|
||||
CoordinatorHandlers::UnregisterReplicaHandler(replication_handler, req_reader, res_builder);
|
||||
});
|
||||
|
||||
server.Register<coordination::EnableWritingOnMainRpc>(
|
||||
[&replication_handler](slk::Reader *req_reader, slk::Builder *res_builder) -> void {
|
||||
spdlog::info("Received EnableWritingOnMainRpc on coordinator server");
|
||||
CoordinatorHandlers::EnableWritingOnMainHandler(replication_handler, req_reader, res_builder);
|
||||
});
|
||||
|
||||
server.Register<coordination::GetInstanceUUIDRpc>(
|
||||
[&replication_handler](slk::Reader *req_reader, slk::Builder *res_builder) -> void {
|
||||
spdlog::info("Received GetInstanceUUIDRpc on coordinator server");
|
||||
CoordinatorHandlers::GetInstanceUUIDHandler(replication_handler, req_reader, res_builder);
|
||||
});
|
||||
}
|
||||
|
||||
void CoordinatorHandlers::SwapMainUUIDHandler(replication::ReplicationHandler &replication_handler,
|
||||
@ -62,12 +80,6 @@ void CoordinatorHandlers::DemoteMainToReplicaHandler(replication::ReplicationHan
|
||||
slk::Reader *req_reader, slk::Builder *res_builder) {
|
||||
spdlog::info("Executing DemoteMainToReplicaHandler");
|
||||
|
||||
if (!replication_handler.IsMain()) {
|
||||
spdlog::error("Setting to replica must be performed on main.");
|
||||
slk::Save(coordination::DemoteMainToReplicaRes{false}, res_builder);
|
||||
return;
|
||||
}
|
||||
|
||||
coordination::DemoteMainToReplicaReq req;
|
||||
slk::Load(&req, req_reader);
|
||||
|
||||
@ -77,11 +89,18 @@ void CoordinatorHandlers::DemoteMainToReplicaHandler(replication::ReplicationHan
|
||||
|
||||
if (!replication_handler.SetReplicationRoleReplica(clients_config, std::nullopt)) {
|
||||
spdlog::error("Demoting main to replica failed!");
|
||||
slk::Save(coordination::PromoteReplicaToMainRes{false}, res_builder);
|
||||
slk::Save(coordination::DemoteMainToReplicaRes{false}, res_builder);
|
||||
return;
|
||||
}
|
||||
|
||||
slk::Save(coordination::PromoteReplicaToMainRes{true}, res_builder);
|
||||
slk::Save(coordination::DemoteMainToReplicaRes{true}, res_builder);
|
||||
}
|
||||
|
||||
void CoordinatorHandlers::GetInstanceUUIDHandler(replication::ReplicationHandler &replication_handler,
|
||||
slk::Reader * /*req_reader*/, slk::Builder *res_builder) {
|
||||
spdlog::info("Executing GetInstanceUUIDHandler");
|
||||
|
||||
slk::Save(coordination::GetInstanceUUIDRes{replication_handler.GetReplicaUUID()}, res_builder);
|
||||
}
|
||||
|
||||
void CoordinatorHandlers::PromoteReplicaToMainHandler(replication::ReplicationHandler &replication_handler,
|
||||
@ -142,9 +161,58 @@ void CoordinatorHandlers::PromoteReplicaToMainHandler(replication::ReplicationHa
|
||||
}
|
||||
}
|
||||
}
|
||||
spdlog::error(fmt::format("FICO : Promote replica to main was success {}", std::string(req.main_uuid_)));
|
||||
spdlog::info("Promote replica to main was success {}", std::string(req.main_uuid_));
|
||||
slk::Save(coordination::PromoteReplicaToMainRes{true}, res_builder);
|
||||
}
|
||||
|
||||
void CoordinatorHandlers::UnregisterReplicaHandler(replication::ReplicationHandler &replication_handler,
|
||||
slk::Reader *req_reader, slk::Builder *res_builder) {
|
||||
if (!replication_handler.IsMain()) {
|
||||
spdlog::error("Unregistering replica must be performed on main.");
|
||||
slk::Save(coordination::UnregisterReplicaRes{false}, res_builder);
|
||||
return;
|
||||
}
|
||||
|
||||
coordination::UnregisterReplicaReq req;
|
||||
slk::Load(&req, req_reader);
|
||||
|
||||
auto res = replication_handler.UnregisterReplica(req.instance_name);
|
||||
switch (res) {
|
||||
using enum memgraph::query::UnregisterReplicaResult;
|
||||
case SUCCESS:
|
||||
slk::Save(coordination::UnregisterReplicaRes{true}, res_builder);
|
||||
break;
|
||||
case NOT_MAIN:
|
||||
spdlog::error("Unregistering replica must be performed on main.");
|
||||
slk::Save(coordination::UnregisterReplicaRes{false}, res_builder);
|
||||
break;
|
||||
case CAN_NOT_UNREGISTER:
|
||||
spdlog::error("Could not unregister replica.");
|
||||
slk::Save(coordination::UnregisterReplicaRes{false}, res_builder);
|
||||
break;
|
||||
case COULD_NOT_BE_PERSISTED:
|
||||
spdlog::error("Could not persist replica unregistration.");
|
||||
slk::Save(coordination::UnregisterReplicaRes{false}, res_builder);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void CoordinatorHandlers::EnableWritingOnMainHandler(replication::ReplicationHandler &replication_handler,
|
||||
slk::Reader * /*req_reader*/, slk::Builder *res_builder) {
|
||||
if (!replication_handler.IsMain()) {
|
||||
spdlog::error("Enable writing on main must be performed on main!");
|
||||
slk::Save(coordination::EnableWritingOnMainRes{false}, res_builder);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!replication_handler.GetReplState().EnableWritingOnMain()) {
|
||||
spdlog::error("Enabling writing on main failed!");
|
||||
slk::Save(coordination::EnableWritingOnMainRes{false}, res_builder);
|
||||
return;
|
||||
}
|
||||
|
||||
slk::Save(coordination::EnableWritingOnMainRes{true}, res_builder);
|
||||
}
|
||||
|
||||
} // namespace memgraph::dbms
|
||||
#endif
|
||||
|
@ -17,82 +17,332 @@
|
||||
#include "nuraft/coordinator_state_machine.hpp"
|
||||
#include "nuraft/coordinator_state_manager.hpp"
|
||||
#include "utils/counter.hpp"
|
||||
#include "utils/functional.hpp"
|
||||
|
||||
#include <range/v3/view.hpp>
|
||||
#include <shared_mutex>
|
||||
|
||||
namespace memgraph::coordination {
|
||||
|
||||
using nuraft::asio_service;
|
||||
using nuraft::cmd_result;
|
||||
using nuraft::cs_new;
|
||||
using nuraft::ptr;
|
||||
using nuraft::raft_params;
|
||||
using nuraft::srv_config;
|
||||
using raft_result = cmd_result<ptr<buffer>>;
|
||||
|
||||
CoordinatorInstance::CoordinatorInstance()
|
||||
: raft_server_id_(FLAGS_raft_server_id), raft_port_(FLAGS_raft_server_port), raft_address_("127.0.0.1") {
|
||||
auto raft_endpoint = raft_address_ + ":" + std::to_string(raft_port_);
|
||||
state_manager_ = cs_new<CoordinatorStateManager>(raft_server_id_, raft_endpoint);
|
||||
state_machine_ = cs_new<CoordinatorStateMachine>();
|
||||
logger_ = nullptr;
|
||||
: raft_state_(RaftState::MakeRaftState(
|
||||
[this] { std::ranges::for_each(repl_instances_, &ReplicationInstance::StartFrequentCheck); },
|
||||
[this] { std::ranges::for_each(repl_instances_, &ReplicationInstance::StopFrequentCheck); })) {
|
||||
auto find_repl_instance = [](CoordinatorInstance *self,
|
||||
std::string_view repl_instance_name) -> ReplicationInstance & {
|
||||
auto repl_instance =
|
||||
std::ranges::find_if(self->repl_instances_, [repl_instance_name](ReplicationInstance const &instance) {
|
||||
return instance.InstanceName() == repl_instance_name;
|
||||
});
|
||||
|
||||
// ASIO options
|
||||
asio_service::options asio_opts;
|
||||
asio_opts.thread_pool_size_ = 1; // TODO: (andi) Improve this
|
||||
MG_ASSERT(repl_instance != self->repl_instances_.end(), "Instance {} not found during callback!",
|
||||
repl_instance_name);
|
||||
return *repl_instance;
|
||||
};
|
||||
|
||||
// RAFT parameters. Heartbeat every 100ms, election timeout between 200ms and 400ms.
|
||||
raft_params params;
|
||||
params.heart_beat_interval_ = 100;
|
||||
params.election_timeout_lower_bound_ = 200;
|
||||
params.election_timeout_upper_bound_ = 400;
|
||||
// 5 logs are preserved before the last snapshot
|
||||
params.reserved_log_items_ = 5;
|
||||
// Create snapshot for every 5 log appends
|
||||
params.snapshot_distance_ = 5;
|
||||
params.client_req_timeout_ = 3000;
|
||||
params.return_method_ = raft_params::blocking;
|
||||
replica_succ_cb_ = [find_repl_instance](CoordinatorInstance *self, std::string_view repl_instance_name) -> void {
|
||||
auto lock = std::lock_guard{self->coord_instance_lock_};
|
||||
spdlog::trace("Instance {} performing replica successful callback", repl_instance_name);
|
||||
auto &repl_instance = find_repl_instance(self, repl_instance_name);
|
||||
|
||||
raft_server_ =
|
||||
launcher_.init(state_machine_, state_manager_, logger_, static_cast<int>(raft_port_), asio_opts, params);
|
||||
// We need to get replicas UUID from time to time to ensure replica is listening to correct main
|
||||
// and that it didn't go down for less time than we could notice
|
||||
// We need to get id of main replica is listening to
|
||||
// and swap if necessary
|
||||
if (!repl_instance.EnsureReplicaHasCorrectMainUUID(self->GetMainUUID())) {
|
||||
spdlog::error("Failed to swap uuid for replica instance {} which is alive", repl_instance.InstanceName());
|
||||
return;
|
||||
}
|
||||
|
||||
if (!raft_server_) {
|
||||
throw RaftServerStartException("Failed to launch raft server on {}", raft_endpoint);
|
||||
}
|
||||
repl_instance.OnSuccessPing();
|
||||
};
|
||||
|
||||
auto maybe_stop = utils::ResettableCounter<20>();
|
||||
while (!raft_server_->is_initialized() && !maybe_stop()) {
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(250));
|
||||
}
|
||||
replica_fail_cb_ = [find_repl_instance](CoordinatorInstance *self, std::string_view repl_instance_name) -> void {
|
||||
auto lock = std::lock_guard{self->coord_instance_lock_};
|
||||
spdlog::trace("Instance {} performing replica failure callback", repl_instance_name);
|
||||
auto &repl_instance = find_repl_instance(self, repl_instance_name);
|
||||
repl_instance.OnFailPing();
|
||||
};
|
||||
|
||||
if (!raft_server_->is_initialized()) {
|
||||
throw RaftServerStartException("Failed to initialize raft server on {}", raft_endpoint);
|
||||
}
|
||||
main_succ_cb_ = [find_repl_instance](CoordinatorInstance *self, std::string_view repl_instance_name) -> void {
|
||||
auto lock = std::lock_guard{self->coord_instance_lock_};
|
||||
spdlog::trace("Instance {} performing main successful callback", repl_instance_name);
|
||||
|
||||
spdlog::info("Raft server started on {}", raft_endpoint);
|
||||
auto &repl_instance = find_repl_instance(self, repl_instance_name);
|
||||
|
||||
if (repl_instance.IsAlive()) {
|
||||
repl_instance.OnSuccessPing();
|
||||
return;
|
||||
}
|
||||
|
||||
const auto &repl_instance_uuid = repl_instance.GetMainUUID();
|
||||
MG_ASSERT(repl_instance_uuid.has_value(), "Instance must have uuid set.");
|
||||
|
||||
auto const curr_main_uuid = self->GetMainUUID();
|
||||
if (curr_main_uuid == repl_instance_uuid.value()) {
|
||||
if (!repl_instance.EnableWritingOnMain()) {
|
||||
spdlog::error("Failed to enable writing on main instance {}", repl_instance_name);
|
||||
return;
|
||||
}
|
||||
|
||||
repl_instance.OnSuccessPing();
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO(antoniof) make demoteToReplica idempotent since main can be demoted to replica but
|
||||
// swapUUID can fail
|
||||
if (repl_instance.DemoteToReplica(self->replica_succ_cb_, self->replica_fail_cb_)) {
|
||||
repl_instance.OnSuccessPing();
|
||||
spdlog::info("Instance {} demoted to replica", repl_instance_name);
|
||||
} else {
|
||||
spdlog::error("Instance {} failed to become replica", repl_instance_name);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!repl_instance.SendSwapAndUpdateUUID(curr_main_uuid)) {
|
||||
spdlog::error(fmt::format("Failed to swap uuid for demoted main instance {}", repl_instance.InstanceName()));
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
main_fail_cb_ = [find_repl_instance](CoordinatorInstance *self, std::string_view repl_instance_name) -> void {
|
||||
auto lock = std::lock_guard{self->coord_instance_lock_};
|
||||
spdlog::trace("Instance {} performing main failure callback", repl_instance_name);
|
||||
auto &repl_instance = find_repl_instance(self, repl_instance_name);
|
||||
repl_instance.OnFailPing();
|
||||
const auto &repl_instance_uuid = repl_instance.GetMainUUID();
|
||||
MG_ASSERT(repl_instance_uuid.has_value(), "Instance must have uuid set");
|
||||
|
||||
if (!repl_instance.IsAlive() && self->GetMainUUID() == repl_instance_uuid.value()) {
|
||||
spdlog::info("Cluster without main instance, trying automatic failover");
|
||||
self->TryFailover(); // TODO: (andi) Initiate failover
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
auto CoordinatorInstance::InstanceName() const -> std::string {
|
||||
return "coordinator_" + std::to_string(raft_server_id_);
|
||||
/// Builds the status list shown to the user: one entry per coordinator reported by the Raft state,
/// followed by one entry per registered replication instance.
///
/// @return statuses of all coordinators first, then of all replication instances.
auto CoordinatorInstance::ShowInstances() const -> std::vector<InstanceStatus> {
  auto const coordinators = raft_state_.GetAllCoordinators();

  auto const describe_coordinator = [](ptr<srv_config> const &coordinator) -> InstanceStatus {
    return {.instance_name = "coordinator_" + std::to_string(coordinator->get_id()),
            .raft_socket_address = coordinator->get_endpoint(),
            .cluster_role = "coordinator",
            .is_alive = true};  // TODO: (andi) Get this info from RAFT and test it or when we will move
                                // CoordinatorState to every instance, we can be smarter about this using our RPC.
  };

  // The role of an instance that is not alive is reported as "unknown".
  auto const role_name = [](ReplicationInstance const &repl) -> std::string {
    if (repl.IsAlive()) {
      return repl.IsMain() ? "main" : "replica";
    }
    return "unknown";
  };

  auto statuses = utils::fmap(describe_coordinator, coordinators);
  {
    // Shared lock: the replication instances are only read here.
    auto lock = std::shared_lock{coord_instance_lock_};
    for (auto const &repl : repl_instances_) {
      statuses.push_back(InstanceStatus{.instance_name = repl.InstanceName(),
                                        .coord_socket_address = repl.SocketAddress(),
                                        .cluster_role = role_name(repl),
                                        .is_alive = repl.IsAlive()});
    }
  }

  return statuses;
}
|
||||
|
||||
auto CoordinatorInstance::RaftSocketAddress() const -> std::string {
|
||||
return raft_address_ + ":" + std::to_string(raft_port_);
|
||||
auto CoordinatorInstance::TryFailover() -> void {
|
||||
auto alive_replicas = repl_instances_ | ranges::views::filter(&ReplicationInstance::IsReplica) |
|
||||
ranges::views::filter(&ReplicationInstance::IsAlive);
|
||||
|
||||
if (ranges::empty(alive_replicas)) {
|
||||
spdlog::warn("Failover failed since all replicas are down!");
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: Smarter choice
|
||||
auto new_main = ranges::begin(alive_replicas);
|
||||
|
||||
new_main->PauseFrequentCheck();
|
||||
utils::OnScopeExit scope_exit{[&new_main] { new_main->ResumeFrequentCheck(); }};
|
||||
|
||||
auto const is_not_new_main = [&new_main](ReplicationInstance &instance) {
|
||||
return instance.InstanceName() != new_main->InstanceName();
|
||||
};
|
||||
|
||||
auto const new_main_uuid = utils::UUID{};
|
||||
// If for some replicas swap fails, for others on successful ping we will revert back on next change
|
||||
// or we will do failover first again and then it will be consistent again
|
||||
for (auto &other_replica_instance : alive_replicas | ranges::views::filter(is_not_new_main)) {
|
||||
if (!other_replica_instance.SendSwapAndUpdateUUID(new_main_uuid)) {
|
||||
spdlog::error(fmt::format("Failed to swap uuid for instance {} which is alive, aborting failover",
|
||||
other_replica_instance.InstanceName()));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: (andi) fmap compliant
|
||||
ReplicationClientsInfo repl_clients_info;
|
||||
repl_clients_info.reserve(repl_instances_.size() - 1);
|
||||
std::ranges::transform(repl_instances_ | ranges::views::filter(is_not_new_main),
|
||||
std::back_inserter(repl_clients_info), &ReplicationInstance::ReplicationClientInfo);
|
||||
|
||||
if (!new_main->PromoteToMain(new_main_uuid, std::move(repl_clients_info), main_succ_cb_, main_fail_cb_)) {
|
||||
spdlog::warn("Failover failed since promoting replica to main failed!");
|
||||
return;
|
||||
}
|
||||
// TODO: (andi) This should be replicated across all coordinator instances with Raft log
|
||||
SetMainUUID(new_main_uuid);
|
||||
spdlog::info("Failover successful! Instance {} promoted to main.", new_main->InstanceName());
|
||||
}
|
||||
|
||||
// TODO: (andi) Make sure you cannot put coordinator instance to the main
|
||||
auto CoordinatorInstance::SetReplicationInstanceToMain(std::string instance_name)
|
||||
-> SetInstanceToMainCoordinatorStatus {
|
||||
auto lock = std::lock_guard{coord_instance_lock_};
|
||||
|
||||
if (std::ranges::any_of(repl_instances_, &ReplicationInstance::IsMain)) {
|
||||
return SetInstanceToMainCoordinatorStatus::MAIN_ALREADY_EXISTS;
|
||||
}
|
||||
|
||||
auto const is_new_main = [&instance_name](ReplicationInstance const &instance) {
|
||||
return instance.InstanceName() == instance_name;
|
||||
};
|
||||
auto new_main = std::ranges::find_if(repl_instances_, is_new_main);
|
||||
|
||||
if (new_main == repl_instances_.end()) {
|
||||
spdlog::error("Instance {} not registered. Please register it using REGISTER INSTANCE {}", instance_name,
|
||||
instance_name);
|
||||
return SetInstanceToMainCoordinatorStatus::NO_INSTANCE_WITH_NAME;
|
||||
}
|
||||
|
||||
new_main->PauseFrequentCheck();
|
||||
utils::OnScopeExit scope_exit{[&new_main] { new_main->ResumeFrequentCheck(); }};
|
||||
|
||||
auto const is_not_new_main = [&instance_name](ReplicationInstance const &instance) {
|
||||
return instance.InstanceName() != instance_name;
|
||||
};
|
||||
|
||||
auto const new_main_uuid = utils::UUID{};
|
||||
|
||||
for (auto &other_instance : repl_instances_ | ranges::views::filter(is_not_new_main)) {
|
||||
if (!other_instance.SendSwapAndUpdateUUID(new_main_uuid)) {
|
||||
spdlog::error(
|
||||
fmt::format("Failed to swap uuid for instance {}, aborting failover", other_instance.InstanceName()));
|
||||
return SetInstanceToMainCoordinatorStatus::SWAP_UUID_FAILED;
|
||||
}
|
||||
}
|
||||
|
||||
ReplicationClientsInfo repl_clients_info;
|
||||
repl_clients_info.reserve(repl_instances_.size() - 1);
|
||||
std::ranges::transform(repl_instances_ | ranges::views::filter(is_not_new_main),
|
||||
std::back_inserter(repl_clients_info), &ReplicationInstance::ReplicationClientInfo);
|
||||
|
||||
if (!new_main->PromoteToMain(new_main_uuid, std::move(repl_clients_info), main_succ_cb_, main_fail_cb_)) {
|
||||
return SetInstanceToMainCoordinatorStatus::COULD_NOT_PROMOTE_TO_MAIN;
|
||||
}
|
||||
|
||||
// TODO: (andi) This should be replicated across all coordinator instances with Raft log
|
||||
SetMainUUID(new_main_uuid);
|
||||
spdlog::info("Instance {} promoted to main", instance_name);
|
||||
return SetInstanceToMainCoordinatorStatus::SUCCESS;
|
||||
}
|
||||
|
||||
auto CoordinatorInstance::RegisterReplicationInstance(CoordinatorClientConfig config)
|
||||
-> RegisterInstanceCoordinatorStatus {
|
||||
auto lock = std::lock_guard{coord_instance_lock_};
|
||||
|
||||
auto instance_name = config.instance_name;
|
||||
|
||||
auto const name_matches = [&instance_name](ReplicationInstance const &instance) {
|
||||
return instance.InstanceName() == instance_name;
|
||||
};
|
||||
|
||||
if (std::ranges::any_of(repl_instances_, name_matches)) {
|
||||
return RegisterInstanceCoordinatorStatus::NAME_EXISTS;
|
||||
}
|
||||
|
||||
auto const socket_address_matches = [&config](ReplicationInstance const &instance) {
|
||||
return instance.SocketAddress() == config.SocketAddress();
|
||||
};
|
||||
|
||||
if (std::ranges::any_of(repl_instances_, socket_address_matches)) {
|
||||
return RegisterInstanceCoordinatorStatus::ENDPOINT_EXISTS;
|
||||
}
|
||||
|
||||
if (!raft_state_.RequestLeadership()) {
|
||||
return RegisterInstanceCoordinatorStatus::NOT_LEADER;
|
||||
}
|
||||
|
||||
auto const res = raft_state_.AppendRegisterReplicationInstance(instance_name);
|
||||
if (!res->get_accepted()) {
|
||||
spdlog::error(
|
||||
"Failed to accept request for registering instance {}. Most likely the reason is that the instance is not "
|
||||
"the "
|
||||
"leader.",
|
||||
config.instance_name);
|
||||
return RegisterInstanceCoordinatorStatus::RAFT_COULD_NOT_ACCEPT;
|
||||
}
|
||||
|
||||
spdlog::info("Request for registering instance {} accepted", instance_name);
|
||||
try {
|
||||
repl_instances_.emplace_back(this, std::move(config), replica_succ_cb_, replica_fail_cb_);
|
||||
} catch (CoordinatorRegisterInstanceException const &) {
|
||||
return RegisterInstanceCoordinatorStatus::RPC_FAILED;
|
||||
}
|
||||
|
||||
if (res->get_result_code() != nuraft::cmd_result_code::OK) {
|
||||
spdlog::error("Failed to register instance {} with error code {}", instance_name, res->get_result_code());
|
||||
return RegisterInstanceCoordinatorStatus::RAFT_COULD_NOT_APPEND;
|
||||
}
|
||||
|
||||
spdlog::info("Instance {} registered", instance_name);
|
||||
return RegisterInstanceCoordinatorStatus::SUCCESS;
|
||||
}
|
||||
|
||||
auto CoordinatorInstance::UnregisterReplicationInstance(std::string instance_name)
|
||||
-> UnregisterInstanceCoordinatorStatus {
|
||||
auto lock = std::lock_guard{coord_instance_lock_};
|
||||
|
||||
auto const name_matches = [&instance_name](ReplicationInstance const &instance) {
|
||||
return instance.InstanceName() == instance_name;
|
||||
};
|
||||
|
||||
auto inst_to_remove = std::ranges::find_if(repl_instances_, name_matches);
|
||||
if (inst_to_remove == repl_instances_.end()) {
|
||||
return UnregisterInstanceCoordinatorStatus::NO_INSTANCE_WITH_NAME;
|
||||
}
|
||||
|
||||
if (inst_to_remove->IsMain() && inst_to_remove->IsAlive()) {
|
||||
return UnregisterInstanceCoordinatorStatus::IS_MAIN;
|
||||
}
|
||||
|
||||
inst_to_remove->StopFrequentCheck();
|
||||
auto curr_main = std::ranges::find_if(repl_instances_, &ReplicationInstance::IsMain);
|
||||
MG_ASSERT(curr_main != repl_instances_.end(), "There must be a main instance when unregistering a replica");
|
||||
if (!curr_main->SendUnregisterReplicaRpc(instance_name)) {
|
||||
inst_to_remove->StartFrequentCheck();
|
||||
return UnregisterInstanceCoordinatorStatus::RPC_FAILED;
|
||||
}
|
||||
std::erase_if(repl_instances_, name_matches);
|
||||
|
||||
return UnregisterInstanceCoordinatorStatus::SUCCESS;
|
||||
}
|
||||
|
||||
auto CoordinatorInstance::AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_port, std::string raft_address)
|
||||
-> void {
|
||||
auto const endpoint = raft_address + ":" + std::to_string(raft_port);
|
||||
srv_config const srv_config_to_add(static_cast<int>(raft_server_id), endpoint);
|
||||
if (!raft_server_->add_srv(srv_config_to_add)->get_accepted()) {
|
||||
throw RaftAddServerException("Failed to add server {} to the cluster", endpoint);
|
||||
}
|
||||
spdlog::info("Request to add server {} to the cluster accepted", endpoint);
|
||||
raft_state_.AddCoordinatorInstance(raft_server_id, raft_port, std::move(raft_address));
|
||||
}
|
||||
|
||||
auto CoordinatorInstance::GetAllCoordinators() const -> std::vector<ptr<srv_config>> {
|
||||
std::vector<ptr<srv_config>> all_srv_configs;
|
||||
raft_server_->get_srv_config_all(all_srv_configs);
|
||||
return all_srv_configs;
|
||||
}
|
||||
auto CoordinatorInstance::GetMainUUID() const -> utils::UUID { return main_uuid_; }
|
||||
|
||||
// TODO: (andi) Add to the RAFT log.
|
||||
auto CoordinatorInstance::SetMainUUID(utils::UUID new_uuid) -> void { main_uuid_ = new_uuid; }
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
#endif
|
||||
|
@ -13,214 +13,149 @@
|
||||
|
||||
#include "nuraft/coordinator_log_store.hpp"
|
||||
|
||||
#include "coordination/coordinator_exceptions.hpp"
|
||||
#include "utils/logging.hpp"
|
||||
|
||||
namespace memgraph::coordination {
|
||||
|
||||
using nuraft::cs_new;
|
||||
using nuraft::timer_helper;
|
||||
|
||||
CoordinatorLogStore::CoordinatorLogStore()
|
||||
: start_idx_(1),
|
||||
raft_server_bwd_pointer_(nullptr),
|
||||
disk_emul_delay(0),
|
||||
disk_emul_thread_(nullptr),
|
||||
disk_emul_thread_stop_signal_(false),
|
||||
disk_emul_last_durable_index_(0) {
|
||||
// Dummy entry for index 0.
|
||||
ptr<buffer> buf = buffer::alloc(sz_ulong);
|
||||
namespace {
|
||||
|
||||
ptr<log_entry> MakeClone(const ptr<log_entry> &entry) {
|
||||
return cs_new<log_entry>(entry->get_term(), buffer::clone(entry->get_buf()), entry->get_val_type(),
|
||||
entry->get_timestamp());
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
CoordinatorLogStore::CoordinatorLogStore() : start_idx_(1) {
|
||||
ptr<buffer> buf = buffer::alloc(sizeof(uint64_t));
|
||||
logs_[0] = cs_new<log_entry>(0, buf);
|
||||
}
|
||||
|
||||
CoordinatorLogStore::~CoordinatorLogStore() {
|
||||
if (disk_emul_thread_) {
|
||||
disk_emul_thread_stop_signal_ = true;
|
||||
// disk_emul_ea_.invoke();
|
||||
if (disk_emul_thread_->joinable()) {
|
||||
disk_emul_thread_->join();
|
||||
}
|
||||
}
|
||||
}
|
||||
CoordinatorLogStore::~CoordinatorLogStore() {}
|
||||
|
||||
ptr<log_entry> CoordinatorLogStore::MakeClone(const ptr<log_entry> &entry) {
|
||||
// NOTE:
|
||||
// Timestamp is used only when `replicate_log_timestamp_` option is on.
|
||||
// Otherwise, log store does not need to store or load it.
|
||||
ptr<log_entry> clone = cs_new<log_entry>(entry->get_term(), buffer::clone(entry->get_buf()), entry->get_val_type(),
|
||||
entry->get_timestamp());
|
||||
return clone;
|
||||
}
|
||||
|
||||
ulong CoordinatorLogStore::next_slot() const {
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
// Exclude the dummy entry.
|
||||
return start_idx_ + logs_.size() - 1;
|
||||
}
|
||||
|
||||
ulong CoordinatorLogStore::start_index() const { return start_idx_; }
|
||||
|
||||
ptr<log_entry> CoordinatorLogStore::last_entry() const {
|
||||
ulong next_idx = next_slot();
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
auto entry = logs_.find(next_idx - 1);
|
||||
auto CoordinatorLogStore::FindOrDefault_(uint64_t index) const -> ptr<log_entry> {
|
||||
auto entry = logs_.find(index);
|
||||
if (entry == logs_.end()) {
|
||||
entry = logs_.find(0);
|
||||
}
|
||||
|
||||
return MakeClone(entry->second);
|
||||
return entry->second;
|
||||
}
|
||||
|
||||
ulong CoordinatorLogStore::append(ptr<log_entry> &entry) {
|
||||
uint64_t CoordinatorLogStore::next_slot() const {
|
||||
auto lock = std::lock_guard{logs_lock_};
|
||||
return start_idx_ + logs_.size() - 1;
|
||||
}
|
||||
|
||||
/// Returns the index of the first log entry kept in the store.
uint64_t CoordinatorLogStore::start_index() const {
  return start_idx_;
}
|
||||
|
||||
/// Returns a clone of the most recently stored log entry.
///
/// The lookup goes through FindOrDefault_, so when no entry exists at the computed index the entry
/// stored under key 0 is cloned instead.
ptr<log_entry> CoordinatorLogStore::last_entry() const {
  auto guard = std::lock_guard{logs_lock_};

  // Same value next_slot() computes; recomputed here because the lock is already held.
  uint64_t const next_index = start_idx_ + logs_.size() - 1;
  return MakeClone(FindOrDefault_(next_index - 1));
}
|
||||
|
||||
uint64_t CoordinatorLogStore::append(ptr<log_entry> &entry) {
|
||||
ptr<log_entry> clone = MakeClone(entry);
|
||||
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
size_t idx = start_idx_ + logs_.size() - 1;
|
||||
logs_[idx] = clone;
|
||||
|
||||
if (disk_emul_delay) {
|
||||
uint64_t cur_time = timer_helper::get_timeofday_us();
|
||||
disk_emul_logs_being_written_[cur_time + disk_emul_delay * 1000] = idx;
|
||||
// disk_emul_ea_.invoke();
|
||||
uint64_t next_slot{0};
|
||||
{
|
||||
auto lock = std::lock_guard{logs_lock_};
|
||||
next_slot = start_idx_ + logs_.size() - 1;
|
||||
logs_[next_slot] = clone;
|
||||
}
|
||||
|
||||
return idx;
|
||||
return next_slot;
|
||||
}
|
||||
|
||||
void CoordinatorLogStore::write_at(ulong index, ptr<log_entry> &entry) {
|
||||
void CoordinatorLogStore::write_at(uint64_t index, ptr<log_entry> &entry) {
|
||||
ptr<log_entry> clone = MakeClone(entry);
|
||||
|
||||
// Discard all logs equal to or greater than `index.
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
auto itr = logs_.lower_bound(index);
|
||||
while (itr != logs_.end()) {
|
||||
itr = logs_.erase(itr);
|
||||
}
|
||||
logs_[index] = clone;
|
||||
|
||||
if (disk_emul_delay) {
|
||||
uint64_t cur_time = timer_helper::get_timeofday_us();
|
||||
disk_emul_logs_being_written_[cur_time + disk_emul_delay * 1000] = index;
|
||||
|
||||
// Remove entries greater than `index`.
|
||||
auto entry = disk_emul_logs_being_written_.begin();
|
||||
while (entry != disk_emul_logs_being_written_.end()) {
|
||||
if (entry->second > index) {
|
||||
entry = disk_emul_logs_being_written_.erase(entry);
|
||||
} else {
|
||||
entry++;
|
||||
}
|
||||
{
|
||||
auto lock = std::lock_guard{logs_lock_};
|
||||
auto itr = logs_.lower_bound(index);
|
||||
while (itr != logs_.end()) {
|
||||
itr = logs_.erase(itr);
|
||||
}
|
||||
// disk_emul_ea_.invoke();
|
||||
logs_[index] = clone;
|
||||
}
|
||||
}
|
||||
|
||||
ptr<std::vector<ptr<log_entry>>> CoordinatorLogStore::log_entries(ulong start, ulong end) {
|
||||
ptr<std::vector<ptr<log_entry>>> ret = cs_new<std::vector<ptr<log_entry>>>();
|
||||
|
||||
ptr<std::vector<ptr<log_entry>>> CoordinatorLogStore::log_entries(uint64_t start, uint64_t end) {
|
||||
auto ret = cs_new<std::vector<ptr<log_entry>>>();
|
||||
ret->resize(end - start);
|
||||
ulong cc = 0;
|
||||
for (ulong ii = start; ii < end; ++ii) {
|
||||
|
||||
for (uint64_t i = start, curr_index = 0; i < end; ++i, ++curr_index) {
|
||||
ptr<log_entry> src = nullptr;
|
||||
{
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
auto entry = logs_.find(ii);
|
||||
if (entry == logs_.end()) {
|
||||
entry = logs_.find(0);
|
||||
assert(0);
|
||||
auto lock = std::lock_guard{logs_lock_};
|
||||
if (auto const entry = logs_.find(i); entry != logs_.end()) {
|
||||
src = entry->second;
|
||||
} else {
|
||||
throw RaftCouldNotFindEntryException("Could not find entry at index {}", i);
|
||||
}
|
||||
src = entry->second;
|
||||
}
|
||||
(*ret)[cc++] = MakeClone(src);
|
||||
(*ret)[curr_index] = MakeClone(src);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-default-arguments)
|
||||
ptr<std::vector<ptr<log_entry>>> CoordinatorLogStore::log_entries_ext(ulong start, ulong end,
|
||||
int64 batch_size_hint_in_bytes) {
|
||||
ptr<std::vector<ptr<log_entry>>> ret = cs_new<std::vector<ptr<log_entry>>>();
|
||||
|
||||
if (batch_size_hint_in_bytes < 0) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
size_t accum_size = 0;
|
||||
for (ulong ii = start; ii < end; ++ii) {
|
||||
ptr<log_entry> src = nullptr;
|
||||
{
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
auto entry = logs_.find(ii);
|
||||
if (entry == logs_.end()) {
|
||||
entry = logs_.find(0);
|
||||
assert(0);
|
||||
}
|
||||
src = entry->second;
|
||||
}
|
||||
ret->push_back(MakeClone(src));
|
||||
accum_size += src->get_buf().size();
|
||||
if (batch_size_hint_in_bytes && accum_size >= (ulong)batch_size_hint_in_bytes) break;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
ptr<log_entry> CoordinatorLogStore::entry_at(ulong index) {
|
||||
ptr<log_entry> CoordinatorLogStore::entry_at(uint64_t index) {
|
||||
ptr<log_entry> src = nullptr;
|
||||
{
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
auto entry = logs_.find(index);
|
||||
if (entry == logs_.end()) {
|
||||
entry = logs_.find(0);
|
||||
}
|
||||
src = entry->second;
|
||||
auto lock = std::lock_guard{logs_lock_};
|
||||
src = FindOrDefault_(index);
|
||||
}
|
||||
return MakeClone(src);
|
||||
}
|
||||
|
||||
ulong CoordinatorLogStore::term_at(ulong index) {
|
||||
ulong term = 0;
|
||||
uint64_t CoordinatorLogStore::term_at(uint64_t index) {
|
||||
uint64_t term = 0;
|
||||
{
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
auto entry = logs_.find(index);
|
||||
if (entry == logs_.end()) {
|
||||
entry = logs_.find(0);
|
||||
}
|
||||
term = entry->second->get_term();
|
||||
auto lock = std::lock_guard{logs_lock_};
|
||||
term = FindOrDefault_(index)->get_term();
|
||||
}
|
||||
return term;
|
||||
}
|
||||
|
||||
ptr<buffer> CoordinatorLogStore::pack(ulong index, int32 cnt) {
|
||||
ptr<buffer> CoordinatorLogStore::pack(uint64_t index, int32 cnt) {
|
||||
std::vector<ptr<buffer>> logs;
|
||||
|
||||
size_t size_total = 0;
|
||||
for (ulong ii = index; ii < index + cnt; ++ii) {
|
||||
uint64_t const end_index = index + cnt;
|
||||
for (uint64_t i = index; i < end_index; ++i) {
|
||||
ptr<log_entry> le = nullptr;
|
||||
{
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
le = logs_[ii];
|
||||
auto lock = std::lock_guard{logs_lock_};
|
||||
le = logs_[i];
|
||||
}
|
||||
assert(le.get());
|
||||
ptr<buffer> buf = le->serialize();
|
||||
MG_ASSERT(le.get(), "Could not find log entry at index {}", i);
|
||||
auto buf = le->serialize();
|
||||
size_total += buf->size();
|
||||
logs.push_back(buf);
|
||||
}
|
||||
|
||||
ptr<buffer> buf_out = buffer::alloc(sizeof(int32) + cnt * sizeof(int32) + size_total);
|
||||
auto buf_out = buffer::alloc(sizeof(int32) + cnt * sizeof(int32) + size_total);
|
||||
buf_out->pos(0);
|
||||
buf_out->put((int32)cnt);
|
||||
|
||||
for (auto &entry : logs) {
|
||||
ptr<buffer> &bb = entry;
|
||||
buf_out->put((int32)bb->size());
|
||||
buf_out->put(*bb);
|
||||
buf_out->put(static_cast<int32>(entry->size()));
|
||||
buf_out->put(*entry);
|
||||
}
|
||||
return buf_out;
|
||||
}
|
||||
|
||||
void CoordinatorLogStore::apply_pack(ulong index, buffer &pack) {
|
||||
void CoordinatorLogStore::apply_pack(uint64_t index, buffer &pack) {
|
||||
pack.pos(0);
|
||||
int32 num_logs = pack.get_int();
|
||||
int32 const num_logs = pack.get_int();
|
||||
|
||||
for (int32 ii = 0; ii < num_logs; ++ii) {
|
||||
ulong cur_idx = index + ii;
|
||||
for (int32 i = 0; i < num_logs; ++i) {
|
||||
uint64_t cur_idx = index + i;
|
||||
int32 buf_size = pack.get_int();
|
||||
|
||||
ptr<buffer> buf_local = buffer::alloc(buf_size);
|
||||
@ -228,14 +163,14 @@ void CoordinatorLogStore::apply_pack(ulong index, buffer &pack) {
|
||||
|
||||
ptr<log_entry> le = log_entry::deserialize(*buf_local);
|
||||
{
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
auto lock = std::lock_guard{logs_lock_};
|
||||
logs_[cur_idx] = le;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
auto entry = logs_.upper_bound(0);
|
||||
auto lock = std::lock_guard{logs_lock_};
|
||||
auto const entry = logs_.upper_bound(0);
|
||||
if (entry != logs_.end()) {
|
||||
start_idx_ = entry->first;
|
||||
} else {
|
||||
@ -244,88 +179,23 @@ void CoordinatorLogStore::apply_pack(ulong index, buffer &pack) {
|
||||
}
|
||||
}
|
||||
|
||||
bool CoordinatorLogStore::compact(ulong last_log_index) {
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
for (ulong ii = start_idx_; ii <= last_log_index; ++ii) {
|
||||
auto entry = logs_.find(ii);
|
||||
// NOTE: Remove all logs up to given 'last_log_index' (inclusive).
|
||||
bool CoordinatorLogStore::compact(uint64_t last_log_index) {
|
||||
auto lock = std::lock_guard{logs_lock_};
|
||||
for (uint64_t ii = start_idx_; ii <= last_log_index; ++ii) {
|
||||
auto const entry = logs_.find(ii);
|
||||
if (entry != logs_.end()) {
|
||||
logs_.erase(entry);
|
||||
}
|
||||
}
|
||||
|
||||
// WARNING:
|
||||
// Even though nothing has been erased,
|
||||
// we should set `start_idx_` to new index.
|
||||
if (start_idx_ <= last_log_index) {
|
||||
start_idx_ = last_log_index + 1;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool CoordinatorLogStore::flush() {
|
||||
disk_emul_last_durable_index_ = next_slot() - 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
ulong CoordinatorLogStore::last_durable_index() {
|
||||
uint64_t last_log = next_slot() - 1;
|
||||
if (!disk_emul_delay) {
|
||||
return last_log;
|
||||
}
|
||||
|
||||
return disk_emul_last_durable_index_;
|
||||
}
|
||||
|
||||
void CoordinatorLogStore::DiskEmulLoop() {
|
||||
// This thread mimics async disk writes.
|
||||
|
||||
// uint32_t next_sleep_us = 100 * 1000;
|
||||
while (!disk_emul_thread_stop_signal_) {
|
||||
// disk_emul_ea_.wait_us(next_sleep_us);
|
||||
// disk_emul_ea_.reset();
|
||||
if (disk_emul_thread_stop_signal_) break;
|
||||
|
||||
uint64_t cur_time = timer_helper::get_timeofday_us();
|
||||
// next_sleep_us = 100 * 1000;
|
||||
|
||||
bool call_notification = false;
|
||||
{
|
||||
std::lock_guard<std::mutex> l(logs_lock_);
|
||||
// Remove all timestamps equal to or smaller than `cur_time`,
|
||||
// and pick the greatest one among them.
|
||||
auto entry = disk_emul_logs_being_written_.begin();
|
||||
while (entry != disk_emul_logs_being_written_.end()) {
|
||||
if (entry->first <= cur_time) {
|
||||
disk_emul_last_durable_index_ = entry->second;
|
||||
entry = disk_emul_logs_being_written_.erase(entry);
|
||||
call_notification = true;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
entry = disk_emul_logs_being_written_.begin();
|
||||
if (entry != disk_emul_logs_being_written_.end()) {
|
||||
// next_sleep_us = entry->first - cur_time;
|
||||
}
|
||||
}
|
||||
|
||||
if (call_notification) {
|
||||
raft_server_bwd_pointer_->notify_log_append_completion(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CoordinatorLogStore::Close() {}
|
||||
|
||||
void CoordinatorLogStore::SetDiskDelay(raft_server *raft, size_t delay_ms) {
|
||||
disk_emul_delay = delay_ms;
|
||||
raft_server_bwd_pointer_ = raft;
|
||||
|
||||
if (!disk_emul_thread_) {
|
||||
disk_emul_thread_ = std::make_unique<std::thread>(&CoordinatorLogStore::DiskEmulLoop, this);
|
||||
}
|
||||
}
|
||||
bool CoordinatorLogStore::flush() { return true; }
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
#endif
|
||||
|
@ -52,6 +52,51 @@ void DemoteMainToReplicaRes::Load(DemoteMainToReplicaRes *self, memgraph::slk::R
|
||||
memgraph::slk::Load(self, reader);
|
||||
}
|
||||
|
||||
void UnregisterReplicaReq::Save(UnregisterReplicaReq const &self, memgraph::slk::Builder *builder) {
|
||||
memgraph::slk::Save(self, builder);
|
||||
}
|
||||
|
||||
void UnregisterReplicaReq::Load(UnregisterReplicaReq *self, memgraph::slk::Reader *reader) {
|
||||
memgraph::slk::Load(self, reader);
|
||||
}
|
||||
|
||||
void UnregisterReplicaRes::Save(UnregisterReplicaRes const &self, memgraph::slk::Builder *builder) {
|
||||
memgraph::slk::Save(self, builder);
|
||||
}
|
||||
|
||||
void UnregisterReplicaRes::Load(UnregisterReplicaRes *self, memgraph::slk::Reader *reader) {
|
||||
memgraph::slk::Load(self, reader);
|
||||
}
|
||||
|
||||
void EnableWritingOnMainRes::Save(EnableWritingOnMainRes const &self, memgraph::slk::Builder *builder) {
|
||||
memgraph::slk::Save(self, builder);
|
||||
}
|
||||
|
||||
void EnableWritingOnMainRes::Load(EnableWritingOnMainRes *self, memgraph::slk::Reader *reader) {
|
||||
memgraph::slk::Load(self, reader);
|
||||
}
|
||||
|
||||
void EnableWritingOnMainReq::Save(EnableWritingOnMainReq const &self, memgraph::slk::Builder *builder) {}
|
||||
|
||||
void EnableWritingOnMainReq::Load(EnableWritingOnMainReq *self, memgraph::slk::Reader *reader) {}
|
||||
|
||||
// GetInstanceUUID
|
||||
/// Serializes `self` into `builder` by delegating to the `memgraph::slk::Save` overload for this type.
void GetInstanceUUIDReq::Save(const GetInstanceUUIDReq &self, memgraph::slk::Builder *builder) {
  memgraph::slk::Save(self, builder);
}
|
||||
|
||||
/// Fills `self` from `reader` by delegating to the `memgraph::slk::Load` overload for this type.
void GetInstanceUUIDReq::Load(GetInstanceUUIDReq *self, memgraph::slk::Reader *reader) {
  memgraph::slk::Load(self, reader);
}
|
||||
|
||||
/// Serializes `self` into `builder` by delegating to the `memgraph::slk::Save` overload for this type.
void GetInstanceUUIDRes::Save(const GetInstanceUUIDRes &self, memgraph::slk::Builder *builder) {
  memgraph::slk::Save(self, builder);
}
|
||||
|
||||
/// Fills `self` from `reader` by delegating to the `memgraph::slk::Load` overload for this type.
void GetInstanceUUIDRes::Load(GetInstanceUUIDRes *self, memgraph::slk::Reader *reader) {
  memgraph::slk::Load(self, reader);
}
|
||||
|
||||
} // namespace coordination
|
||||
|
||||
constexpr utils::TypeInfo coordination::PromoteReplicaToMainReq::kType{utils::TypeId::COORD_FAILOVER_REQ,
|
||||
@ -64,10 +109,31 @@ constexpr utils::TypeInfo coordination::DemoteMainToReplicaReq::kType{utils::Typ
|
||||
"CoordDemoteToReplicaReq", nullptr};
|
||||
|
||||
// Definitions of the static `kType` descriptors of the coordination RPC messages.
// Each one pairs the message struct with a `utils::TypeId` value and a name string.

constexpr utils::TypeInfo coordination::DemoteMainToReplicaRes::kType{
    utils::TypeId::COORD_SET_REPL_MAIN_RES, "CoordDemoteToReplicaRes", nullptr};

constexpr utils::TypeInfo coordination::UnregisterReplicaReq::kType{
    utils::TypeId::COORD_UNREGISTER_REPLICA_REQ, "UnregisterReplicaReq", nullptr};

constexpr utils::TypeInfo coordination::UnregisterReplicaRes::kType{
    utils::TypeId::COORD_UNREGISTER_REPLICA_RES, "UnregisterReplicaRes", nullptr};

constexpr utils::TypeInfo coordination::EnableWritingOnMainReq::kType{
    utils::TypeId::COORD_ENABLE_WRITING_ON_MAIN_REQ, "CoordEnableWritingOnMainReq", nullptr};

constexpr utils::TypeInfo coordination::EnableWritingOnMainRes::kType{
    utils::TypeId::COORD_ENABLE_WRITING_ON_MAIN_RES, "CoordEnableWritingOnMainRes", nullptr};

constexpr utils::TypeInfo coordination::GetInstanceUUIDReq::kType{
    utils::TypeId::COORD_GET_UUID_REQ, "CoordGetUUIDReq", nullptr};

constexpr utils::TypeInfo coordination::GetInstanceUUIDRes::kType{
    utils::TypeId::COORD_GET_UUID_RES, "CoordGetUUIDRes", nullptr};
|
||||
|
||||
namespace slk {
|
||||
|
||||
// PromoteReplicaToMainRpc
|
||||
|
||||
/// Writes the `success` field of the response to `builder`.
void Save(const memgraph::coordination::PromoteReplicaToMainRes &self, memgraph::slk::Builder *builder) {
  memgraph::slk::Save(self.success, builder);
}
|
||||
@ -86,6 +152,7 @@ void Load(memgraph::coordination::PromoteReplicaToMainReq *self, memgraph::slk::
|
||||
memgraph::slk::Load(&self->replication_clients_info, reader);
|
||||
}
|
||||
|
||||
// DemoteMainToReplicaRpc
|
||||
/// Writes the `replication_client_info` field of the request to `builder`.
void Save(const memgraph::coordination::DemoteMainToReplicaReq &self, memgraph::slk::Builder *builder) {
  memgraph::slk::Save(self.replication_client_info, builder);
}
|
||||
@ -102,6 +169,50 @@ void Load(memgraph::coordination::DemoteMainToReplicaRes *self, memgraph::slk::R
|
||||
memgraph::slk::Load(&self->success, reader);
|
||||
}
|
||||
|
||||
// UnregisterReplicaRpc
|
||||
|
||||
/// Writes the `instance_name` field of the request to `builder`.
void Save(memgraph::coordination::UnregisterReplicaReq const &self, memgraph::slk::Builder *builder) {
  memgraph::slk::Save(self.instance_name, builder);
}
|
||||
|
||||
/// Reads the `instance_name` field of the request from `reader`.
void Load(memgraph::coordination::UnregisterReplicaReq *self, memgraph::slk::Reader *reader) {
  memgraph::slk::Load(&self->instance_name, reader);
}
|
||||
|
||||
/// Writes the `success` field of the response to `builder`.
void Save(memgraph::coordination::UnregisterReplicaRes const &self, memgraph::slk::Builder *builder) {
  memgraph::slk::Save(self.success, builder);
}
|
||||
|
||||
/// Reads the `success` field of the response from `reader`.
void Load(memgraph::coordination::UnregisterReplicaRes *self, memgraph::slk::Reader *reader) {
  memgraph::slk::Load(&self->success, reader);
}
|
||||
|
||||
/// Writes the `success` field of the response to `builder`.
void Save(memgraph::coordination::EnableWritingOnMainRes const &self, memgraph::slk::Builder *builder) {
  memgraph::slk::Save(self.success, builder);
}
|
||||
|
||||
/// Reads the `success` field of the response from `reader`.
void Load(memgraph::coordination::EnableWritingOnMainRes *self, memgraph::slk::Reader *reader) {
  memgraph::slk::Load(&self->success, reader);
}
|
||||
|
||||
// GetInstanceUUIDRpc
|
||||
|
||||
/// No-op: the request has no fields, so nothing is written.
void Save(const memgraph::coordination::GetInstanceUUIDReq & /*self*/, memgraph::slk::Builder * /*builder*/) {
  // Nothing to serialize.
}
|
||||
|
||||
/// No-op: the request has no fields, so nothing is read.
void Load(memgraph::coordination::GetInstanceUUIDReq * /*self*/, memgraph::slk::Reader * /*reader*/) {
  // Nothing to deserialize.
}
|
||||
|
||||
/// Writes the `uuid` field of the response to `builder`.
void Save(const memgraph::coordination::GetInstanceUUIDRes &self, memgraph::slk::Builder *builder) {
  memgraph::slk::Save(self.uuid, builder);
}
|
||||
|
||||
/// Reads the `uuid` field of the response from `reader`.
void Load(memgraph::coordination::GetInstanceUUIDRes *self, memgraph::slk::Reader *reader) {
  memgraph::slk::Load(&self->uuid, reader);
}
|
||||
|
||||
} // namespace slk
|
||||
|
||||
} // namespace memgraph
|
||||
|
@ -41,37 +41,53 @@ CoordinatorState::CoordinatorState() {
|
||||
}
|
||||
}
|
||||
|
||||
auto CoordinatorState::RegisterInstance(CoordinatorClientConfig config) -> RegisterInstanceCoordinatorStatus {
|
||||
MG_ASSERT(std::holds_alternative<CoordinatorData>(data_),
|
||||
auto CoordinatorState::RegisterReplicationInstance(CoordinatorClientConfig config)
|
||||
-> RegisterInstanceCoordinatorStatus {
|
||||
MG_ASSERT(std::holds_alternative<CoordinatorInstance>(data_),
|
||||
"Coordinator cannot register replica since variant holds wrong alternative");
|
||||
|
||||
return std::visit(
|
||||
memgraph::utils::Overloaded{
|
||||
[](const CoordinatorMainReplicaData & /*coordinator_main_replica_data*/) {
|
||||
return RegisterInstanceCoordinatorStatus::NOT_COORDINATOR;
|
||||
},
|
||||
[config](CoordinatorData &coordinator_data) { return coordinator_data.RegisterInstance(config); }},
|
||||
memgraph::utils::Overloaded{[](const CoordinatorMainReplicaData & /*coordinator_main_replica_data*/) {
|
||||
return RegisterInstanceCoordinatorStatus::NOT_COORDINATOR;
|
||||
},
|
||||
[config](CoordinatorInstance &coordinator_instance) {
|
||||
return coordinator_instance.RegisterReplicationInstance(config);
|
||||
}},
|
||||
data_);
|
||||
}
|
||||
|
||||
auto CoordinatorState::SetInstanceToMain(std::string instance_name) -> SetInstanceToMainCoordinatorStatus {
|
||||
MG_ASSERT(std::holds_alternative<CoordinatorData>(data_),
|
||||
/// Forwards an unregister request to the coordinator instance held in `data_`.
///
/// Asserts that `data_` holds a `CoordinatorInstance`. The visitor still provides a
/// `CoordinatorMainReplicaData` branch, which yields `NOT_COORDINATOR`.
///
/// @param instance_name name passed on to `CoordinatorInstance::UnregisterReplicationInstance`.
/// @return the status produced by the visited alternative.
auto CoordinatorState::UnregisterReplicationInstance(std::string instance_name) -> UnregisterInstanceCoordinatorStatus {
  MG_ASSERT(std::holds_alternative<CoordinatorInstance>(data_),
            "Coordinator cannot unregister instance since variant holds wrong alternative");

  auto on_main_or_replica = [](const CoordinatorMainReplicaData & /*coordinator_main_replica_data*/) {
    return UnregisterInstanceCoordinatorStatus::NOT_COORDINATOR;
  };

  auto on_coordinator = [&instance_name](CoordinatorInstance &coordinator_instance) {
    return coordinator_instance.UnregisterReplicationInstance(instance_name);
  };

  return std::visit(memgraph::utils::Overloaded{on_main_or_replica, on_coordinator}, data_);
}
|
||||
|
||||
auto CoordinatorState::SetReplicationInstanceToMain(std::string instance_name) -> SetInstanceToMainCoordinatorStatus {
|
||||
MG_ASSERT(std::holds_alternative<CoordinatorInstance>(data_),
|
||||
"Coordinator cannot register replica since variant holds wrong alternative");
|
||||
|
||||
return std::visit(
|
||||
memgraph::utils::Overloaded{[](const CoordinatorMainReplicaData & /*coordinator_main_replica_data*/) {
|
||||
return SetInstanceToMainCoordinatorStatus::NOT_COORDINATOR;
|
||||
},
|
||||
[&instance_name](CoordinatorData &coordinator_data) {
|
||||
return coordinator_data.SetInstanceToMain(instance_name);
|
||||
[&instance_name](CoordinatorInstance &coordinator_instance) {
|
||||
return coordinator_instance.SetReplicationInstanceToMain(instance_name);
|
||||
}},
|
||||
data_);
|
||||
}
|
||||
|
||||
auto CoordinatorState::ShowInstances() const -> std::vector<InstanceStatus> {
|
||||
MG_ASSERT(std::holds_alternative<CoordinatorData>(data_),
|
||||
MG_ASSERT(std::holds_alternative<CoordinatorInstance>(data_),
|
||||
"Can't call show instances on data_, as variant holds wrong alternative");
|
||||
return std::get<CoordinatorData>(data_).ShowInstances();
|
||||
return std::get<CoordinatorInstance>(data_).ShowInstances();
|
||||
}
|
||||
|
||||
auto CoordinatorState::GetCoordinatorServer() const -> CoordinatorServer & {
|
||||
@ -82,9 +98,9 @@ auto CoordinatorState::GetCoordinatorServer() const -> CoordinatorServer & {
|
||||
|
||||
auto CoordinatorState::AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_port, std::string raft_address)
|
||||
-> void {
|
||||
MG_ASSERT(std::holds_alternative<CoordinatorData>(data_),
|
||||
MG_ASSERT(std::holds_alternative<CoordinatorInstance>(data_),
|
||||
"Coordinator cannot register replica since variant holds wrong alternative");
|
||||
return std::get<CoordinatorData>(data_).AddCoordinatorInstance(raft_server_id, raft_port, raft_address);
|
||||
return std::get<CoordinatorInstance>(data_).AddCoordinatorInstance(raft_server_id, raft_port, raft_address);
|
||||
}
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
|
@ -15,6 +15,19 @@
|
||||
|
||||
namespace memgraph::coordination {
|
||||
|
||||
/// Encodes a register-replication-instance log entry.
///
/// The payload is `name` with the suffix "_replica", written with `put_str`
/// into a freshly allocated buffer.
///
/// @param name instance name to encode.
/// @return buffer holding the serialized string.
auto CoordinatorStateMachine::EncodeRegisterReplicationInstance(const std::string &name) -> ptr<buffer> {
  std::string const payload = name + "_replica";

  // Room for the string bytes plus one uint32_t — presumably the length prefix
  // that put_str writes; verify against the NuRaft buffer_serializer docs.
  ptr<buffer> encoded = buffer::alloc(sizeof(uint32_t) + payload.size());

  buffer_serializer serializer(encoded);
  serializer.put_str(payload);
  return encoded;
}
|
||||
|
||||
/// Decodes a register-replication-instance log entry.
///
/// @param data buffer to read from.
/// @return the string obtained with `get_str` from the start of `data`.
auto CoordinatorStateMachine::DecodeRegisterReplicationInstance(buffer &data) -> std::string {
  buffer_serializer deserializer(data);
  return deserializer.get_str();
}
|
||||
|
||||
auto CoordinatorStateMachine::pre_commit(ulong const log_idx, buffer &data) -> ptr<buffer> {
|
||||
buffer_serializer bs(data);
|
||||
std::string str = bs.get_str();
|
||||
|
@ -11,23 +11,25 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "utils/uuid.hpp"
|
||||
#ifdef MG_ENTERPRISE
|
||||
|
||||
#include "coordination/coordinator_config.hpp"
|
||||
#include "rpc/client.hpp"
|
||||
#include "rpc_errors.hpp"
|
||||
#include "utils/result.hpp"
|
||||
#include "utils/scheduler.hpp"
|
||||
#include "utils/uuid.hpp"
|
||||
|
||||
namespace memgraph::coordination {
|
||||
|
||||
class CoordinatorData;
|
||||
using HealthCheckCallback = std::function<void(CoordinatorData *, std::string_view)>;
|
||||
class CoordinatorInstance;
|
||||
using HealthCheckCallback = std::function<void(CoordinatorInstance *, std::string_view)>;
|
||||
using ReplicationClientsInfo = std::vector<ReplClientInfo>;
|
||||
|
||||
class CoordinatorClient {
|
||||
public:
|
||||
explicit CoordinatorClient(CoordinatorData *coord_data_, CoordinatorClientConfig config, HealthCheckCallback succ_cb,
|
||||
HealthCheckCallback fail_cb);
|
||||
explicit CoordinatorClient(CoordinatorInstance *coord_instance, CoordinatorClientConfig config,
|
||||
HealthCheckCallback succ_cb, HealthCheckCallback fail_cb);
|
||||
|
||||
~CoordinatorClient() = default;
|
||||
|
||||
@ -46,17 +48,28 @@ class CoordinatorClient {
|
||||
auto SocketAddress() const -> std::string;
|
||||
|
||||
[[nodiscard]] auto DemoteToReplica() const -> bool;
|
||||
|
||||
auto SendPromoteReplicaToMainRpc(const utils::UUID &uuid, ReplicationClientsInfo replication_clients_info) const
|
||||
-> bool;
|
||||
|
||||
auto SendSwapMainUUIDRpc(const utils::UUID &uuid) const -> bool;
|
||||
|
||||
auto SendUnregisterReplicaRpc(std::string const &instance_name) const -> bool;
|
||||
|
||||
auto SendEnableWritingOnMainRpc() const -> bool;
|
||||
|
||||
auto SendGetInstanceUUIDRpc() const -> memgraph::utils::BasicResult<GetInstanceUUIDError, std::optional<utils::UUID>>;
|
||||
|
||||
auto ReplicationClientInfo() const -> ReplClientInfo;
|
||||
|
||||
auto SetCallbacks(HealthCheckCallback succ_cb, HealthCheckCallback fail_cb) -> void;
|
||||
|
||||
auto RpcClient() -> rpc::Client & { return rpc_client_; }
|
||||
|
||||
auto InstanceDownTimeoutSec() const -> std::chrono::seconds;
|
||||
|
||||
auto InstanceGetUUIDFrequencySec() const -> std::chrono::seconds;
|
||||
|
||||
friend bool operator==(CoordinatorClient const &first, CoordinatorClient const &second) {
|
||||
return first.config_ == second.config_;
|
||||
}
|
||||
@ -64,12 +77,11 @@ class CoordinatorClient {
|
||||
private:
|
||||
utils::Scheduler instance_checker_;
|
||||
|
||||
// TODO: (andi) Pimpl?
|
||||
communication::ClientContext rpc_context_;
|
||||
mutable rpc::Client rpc_client_;
|
||||
|
||||
CoordinatorClientConfig config_;
|
||||
CoordinatorData *coord_data_;
|
||||
CoordinatorInstance *coord_instance_;
|
||||
HealthCheckCallback succ_cb_;
|
||||
HealthCheckCallback fail_cb_;
|
||||
};
|
||||
|
@ -28,7 +28,9 @@ struct CoordinatorClientConfig {
|
||||
std::string instance_name;
|
||||
std::string ip_address;
|
||||
uint16_t port{};
|
||||
std::chrono::seconds health_check_frequency_sec{1};
|
||||
std::chrono::seconds instance_health_check_frequency_sec{1};
|
||||
std::chrono::seconds instance_down_timeout_sec{5};
|
||||
std::chrono::seconds instance_get_uuid_frequency_sec{10};
|
||||
|
||||
auto SocketAddress() const -> std::string { return ip_address + ":" + std::to_string(port); }
|
||||
|
||||
|
@ -1,61 +0,0 @@
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
// License, and you may not use this file except in compliance with the Business Source License.
|
||||
//
|
||||
// As of the Change Date specified in that file, in accordance with
|
||||
// the Business Source License, use of this software will be governed
|
||||
// by the Apache License, Version 2.0, included in the file
|
||||
// licenses/APL.txt.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
|
||||
#include "coordination/coordinator_instance.hpp"
|
||||
#include "coordination/coordinator_server.hpp"
|
||||
#include "coordination/instance_status.hpp"
|
||||
#include "coordination/register_main_replica_coordinator_status.hpp"
|
||||
#include "coordination/replication_instance.hpp"
|
||||
#include "replication_coordination_glue/handler.hpp"
|
||||
#include "utils/rw_lock.hpp"
|
||||
#include "utils/thread_pool.hpp"
|
||||
#include "utils/uuid.hpp"
|
||||
|
||||
#include <list>
|
||||
|
||||
namespace memgraph::coordination {
|
||||
class CoordinatorData {
|
||||
public:
|
||||
CoordinatorData();
|
||||
|
||||
// TODO: (andi) Probably rename to RegisterReplicationInstance
|
||||
[[nodiscard]] auto RegisterInstance(CoordinatorClientConfig config) -> RegisterInstanceCoordinatorStatus;
|
||||
|
||||
[[nodiscard]] auto SetInstanceToMain(std::string instance_name) -> SetInstanceToMainCoordinatorStatus;
|
||||
|
||||
auto ShowInstances() const -> std::vector<InstanceStatus>;
|
||||
|
||||
auto TryFailover() -> void;
|
||||
|
||||
auto AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_port, std::string raft_address) -> void;
|
||||
|
||||
private:
|
||||
HealthCheckCallback main_succ_cb_, main_fail_cb_, replica_succ_cb_, replica_fail_cb_;
|
||||
|
||||
// NOTE: Must be std::list because we rely on pointer stability
|
||||
std::list<ReplicationInstance> repl_instances_;
|
||||
mutable utils::RWLock coord_data_lock_{utils::RWLock::Priority::READ};
|
||||
|
||||
CoordinatorInstance self_;
|
||||
|
||||
utils::UUID main_uuid_;
|
||||
};
|
||||
|
||||
struct CoordinatorMainReplicaData {
|
||||
std::unique_ptr<CoordinatorServer> coordinator_server_;
|
||||
};
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
#endif
|
@ -50,5 +50,38 @@ class RaftAddServerException final : public utils::BasicException {
|
||||
SPECIALIZE_GET_EXCEPTION_NAME(RaftAddServerException)
|
||||
};
|
||||
|
||||
class RaftBecomeLeaderException final : public utils::BasicException {
|
||||
public:
|
||||
explicit RaftBecomeLeaderException(std::string_view what) noexcept : BasicException(what) {}
|
||||
|
||||
template <class... Args>
|
||||
explicit RaftBecomeLeaderException(fmt::format_string<Args...> fmt, Args &&...args) noexcept
|
||||
: RaftBecomeLeaderException(fmt::format(fmt, std::forward<Args>(args)...)) {}
|
||||
|
||||
SPECIALIZE_GET_EXCEPTION_NAME(RaftBecomeLeaderException)
|
||||
};
|
||||
|
||||
class RaftCouldNotFindEntryException final : public utils::BasicException {
|
||||
public:
|
||||
explicit RaftCouldNotFindEntryException(std::string_view what) noexcept : BasicException(what) {}
|
||||
|
||||
template <class... Args>
|
||||
explicit RaftCouldNotFindEntryException(fmt::format_string<Args...> fmt, Args &&...args) noexcept
|
||||
: RaftCouldNotFindEntryException(fmt::format(fmt, std::forward<Args>(args)...)) {}
|
||||
|
||||
SPECIALIZE_GET_EXCEPTION_NAME(RaftCouldNotFindEntryException)
|
||||
};
|
||||
|
||||
class RaftCouldNotParseFlagsException final : public utils::BasicException {
|
||||
public:
|
||||
explicit RaftCouldNotParseFlagsException(std::string_view what) noexcept : BasicException(what) {}
|
||||
|
||||
template <class... Args>
|
||||
explicit RaftCouldNotParseFlagsException(fmt::format_string<Args...> fmt, Args &&...args) noexcept
|
||||
: RaftCouldNotParseFlagsException(fmt::format(fmt, std::forward<Args>(args)...)) {}
|
||||
|
||||
SPECIALIZE_GET_EXCEPTION_NAME(RaftCouldNotParseFlagsException)
|
||||
};
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
#endif
|
||||
|
@ -33,6 +33,14 @@ class CoordinatorHandlers {
|
||||
slk::Builder *res_builder);
|
||||
static void SwapMainUUIDHandler(replication::ReplicationHandler &replication_handler, slk::Reader *req_reader,
|
||||
slk::Builder *res_builder);
|
||||
|
||||
static void UnregisterReplicaHandler(replication::ReplicationHandler &replication_handler, slk::Reader *req_reader,
|
||||
slk::Builder *res_builder);
|
||||
static void EnableWritingOnMainHandler(replication::ReplicationHandler &replication_handler, slk::Reader *req_reader,
|
||||
slk::Builder *res_builder);
|
||||
|
||||
static void GetInstanceUUIDHandler(replication::ReplicationHandler &replication_handler, slk::Reader *req_reader,
|
||||
slk::Builder *res_builder);
|
||||
};
|
||||
|
||||
} // namespace memgraph::dbms
|
||||
|
@ -13,45 +13,47 @@
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
|
||||
#include <flags/replication.hpp>
|
||||
#include "coordination/coordinator_server.hpp"
|
||||
#include "coordination/instance_status.hpp"
|
||||
#include "coordination/raft_state.hpp"
|
||||
#include "coordination/register_main_replica_coordinator_status.hpp"
|
||||
#include "coordination/replication_instance.hpp"
|
||||
#include "utils/rw_lock.hpp"
|
||||
#include "utils/thread_pool.hpp"
|
||||
|
||||
#include <libnuraft/nuraft.hxx>
|
||||
#include <list>
|
||||
|
||||
namespace memgraph::coordination {
|
||||
|
||||
using nuraft::logger;
|
||||
using nuraft::ptr;
|
||||
using nuraft::raft_launcher;
|
||||
using nuraft::raft_server;
|
||||
using nuraft::srv_config;
|
||||
using nuraft::state_machine;
|
||||
using nuraft::state_mgr;
|
||||
|
||||
class CoordinatorInstance {
|
||||
public:
|
||||
CoordinatorInstance();
|
||||
CoordinatorInstance(CoordinatorInstance const &other) = delete;
|
||||
CoordinatorInstance &operator=(CoordinatorInstance const &other) = delete;
|
||||
CoordinatorInstance(CoordinatorInstance &&other) noexcept = delete;
|
||||
CoordinatorInstance &operator=(CoordinatorInstance &&other) noexcept = delete;
|
||||
~CoordinatorInstance() = default;
|
||||
|
||||
auto InstanceName() const -> std::string;
|
||||
auto RaftSocketAddress() const -> std::string;
|
||||
[[nodiscard]] auto RegisterReplicationInstance(CoordinatorClientConfig config) -> RegisterInstanceCoordinatorStatus;
|
||||
[[nodiscard]] auto UnregisterReplicationInstance(std::string instance_name) -> UnregisterInstanceCoordinatorStatus;
|
||||
|
||||
[[nodiscard]] auto SetReplicationInstanceToMain(std::string instance_name) -> SetInstanceToMainCoordinatorStatus;
|
||||
|
||||
auto ShowInstances() const -> std::vector<InstanceStatus>;
|
||||
|
||||
auto TryFailover() -> void;
|
||||
|
||||
auto AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_port, std::string raft_address) -> void;
|
||||
auto GetAllCoordinators() const -> std::vector<ptr<srv_config>>;
|
||||
|
||||
auto GetMainUUID() const -> utils::UUID;
|
||||
|
||||
auto SetMainUUID(utils::UUID new_uuid) -> void;
|
||||
|
||||
private:
|
||||
ptr<state_machine> state_machine_;
|
||||
ptr<state_mgr> state_manager_;
|
||||
ptr<raft_server> raft_server_;
|
||||
ptr<logger> logger_;
|
||||
raft_launcher launcher_;
|
||||
HealthCheckCallback main_succ_cb_, main_fail_cb_, replica_succ_cb_, replica_fail_cb_;
|
||||
|
||||
// TODO: (andi) I think variables below can be abstracted
|
||||
uint32_t raft_server_id_;
|
||||
uint32_t raft_port_;
|
||||
std::string raft_address_;
|
||||
// NOTE: Must be std::list because we rely on pointer stability
|
||||
std::list<ReplicationInstance> repl_instances_;
|
||||
mutable utils::RWLock coord_instance_lock_{utils::RWLock::Priority::READ};
|
||||
|
||||
utils::UUID main_uuid_;
|
||||
|
||||
RaftState raft_state_;
|
||||
};
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
|
@ -82,6 +82,85 @@ struct DemoteMainToReplicaRes {
|
||||
|
||||
using DemoteMainToReplicaRpc = rpc::RequestResponse<DemoteMainToReplicaReq, DemoteMainToReplicaRes>;
|
||||
|
||||
struct UnregisterReplicaReq {
|
||||
static const utils::TypeInfo kType;
|
||||
static const utils::TypeInfo &GetTypeInfo() { return kType; }
|
||||
|
||||
static void Load(UnregisterReplicaReq *self, memgraph::slk::Reader *reader);
|
||||
static void Save(UnregisterReplicaReq const &self, memgraph::slk::Builder *builder);
|
||||
|
||||
explicit UnregisterReplicaReq(std::string instance_name) : instance_name(std::move(instance_name)) {}
|
||||
|
||||
UnregisterReplicaReq() = default;
|
||||
|
||||
std::string instance_name;
|
||||
};
|
||||
|
||||
struct UnregisterReplicaRes {
|
||||
static const utils::TypeInfo kType;
|
||||
static const utils::TypeInfo &GetTypeInfo() { return kType; }
|
||||
|
||||
static void Load(UnregisterReplicaRes *self, memgraph::slk::Reader *reader);
|
||||
static void Save(const UnregisterReplicaRes &self, memgraph::slk::Builder *builder);
|
||||
|
||||
explicit UnregisterReplicaRes(bool success) : success(success) {}
|
||||
UnregisterReplicaRes() = default;
|
||||
|
||||
bool success;
|
||||
};
|
||||
|
||||
using UnregisterReplicaRpc = rpc::RequestResponse<UnregisterReplicaReq, UnregisterReplicaRes>;
|
||||
|
||||
struct EnableWritingOnMainReq {
|
||||
static const utils::TypeInfo kType;
|
||||
static const utils::TypeInfo &GetTypeInfo() { return kType; }
|
||||
|
||||
static void Load(EnableWritingOnMainReq *self, memgraph::slk::Reader *reader);
|
||||
static void Save(EnableWritingOnMainReq const &self, memgraph::slk::Builder *builder);
|
||||
|
||||
EnableWritingOnMainReq() = default;
|
||||
};
|
||||
|
||||
struct EnableWritingOnMainRes {
|
||||
static const utils::TypeInfo kType;
|
||||
static const utils::TypeInfo &GetTypeInfo() { return kType; }
|
||||
|
||||
static void Load(EnableWritingOnMainRes *self, memgraph::slk::Reader *reader);
|
||||
static void Save(EnableWritingOnMainRes const &self, memgraph::slk::Builder *builder);
|
||||
|
||||
explicit EnableWritingOnMainRes(bool success) : success(success) {}
|
||||
EnableWritingOnMainRes() = default;
|
||||
|
||||
bool success;
|
||||
};
|
||||
|
||||
using EnableWritingOnMainRpc = rpc::RequestResponse<EnableWritingOnMainReq, EnableWritingOnMainRes>;
|
||||
|
||||
struct GetInstanceUUIDReq {
|
||||
static const utils::TypeInfo kType;
|
||||
static const utils::TypeInfo &GetTypeInfo() { return kType; }
|
||||
|
||||
static void Load(GetInstanceUUIDReq *self, memgraph::slk::Reader *reader);
|
||||
static void Save(const GetInstanceUUIDReq &self, memgraph::slk::Builder *builder);
|
||||
|
||||
GetInstanceUUIDReq() = default;
|
||||
};
|
||||
|
||||
struct GetInstanceUUIDRes {
|
||||
static const utils::TypeInfo kType;
|
||||
static const utils::TypeInfo &GetTypeInfo() { return kType; }
|
||||
|
||||
static void Load(GetInstanceUUIDRes *self, memgraph::slk::Reader *reader);
|
||||
static void Save(const GetInstanceUUIDRes &self, memgraph::slk::Builder *builder);
|
||||
|
||||
explicit GetInstanceUUIDRes(std::optional<utils::UUID> uuid) : uuid(uuid) {}
|
||||
GetInstanceUUIDRes() = default;
|
||||
|
||||
std::optional<utils::UUID> uuid;
|
||||
};
|
||||
|
||||
using GetInstanceUUIDRpc = rpc::RequestResponse<GetInstanceUUIDReq, GetInstanceUUIDRes>;
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
|
||||
// SLK serialization declarations
|
||||
@ -99,6 +178,19 @@ void Load(memgraph::coordination::DemoteMainToReplicaRes *self, memgraph::slk::R
|
||||
void Save(const memgraph::coordination::DemoteMainToReplicaReq &self, memgraph::slk::Builder *builder);
|
||||
void Load(memgraph::coordination::DemoteMainToReplicaReq *self, memgraph::slk::Reader *reader);
|
||||
|
||||
// GetInstanceUUIDRpc
|
||||
void Save(const memgraph::coordination::GetInstanceUUIDReq &self, memgraph::slk::Builder *builder);
|
||||
void Load(memgraph::coordination::GetInstanceUUIDReq *self, memgraph::slk::Reader *reader);
|
||||
void Save(const memgraph::coordination::GetInstanceUUIDRes &self, memgraph::slk::Builder *builder);
|
||||
void Load(memgraph::coordination::GetInstanceUUIDRes *self, memgraph::slk::Reader *reader);
|
||||
// UnregisterReplicaRpc
|
||||
void Save(memgraph::coordination::UnregisterReplicaRes const &self, memgraph::slk::Builder *builder);
|
||||
void Load(memgraph::coordination::UnregisterReplicaRes *self, memgraph::slk::Reader *reader);
|
||||
void Save(memgraph::coordination::UnregisterReplicaReq const &self, memgraph::slk::Builder *builder);
|
||||
void Load(memgraph::coordination::UnregisterReplicaReq *self, memgraph::slk::Reader *reader);
|
||||
|
||||
void Save(memgraph::coordination::EnableWritingOnMainRes const &self, memgraph::slk::Builder *builder);
|
||||
void Load(memgraph::coordination::EnableWritingOnMainRes *self, memgraph::slk::Reader *reader);
|
||||
|
||||
} // namespace memgraph::slk
|
||||
|
||||
|
@ -13,7 +13,7 @@
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
|
||||
#include "coordination/coordinator_data.hpp"
|
||||
#include "coordination/coordinator_instance.hpp"
|
||||
#include "coordination/coordinator_server.hpp"
|
||||
#include "coordination/instance_status.hpp"
|
||||
#include "coordination/register_main_replica_coordinator_status.hpp"
|
||||
@ -33,19 +33,24 @@ class CoordinatorState {
|
||||
CoordinatorState(CoordinatorState &&) noexcept = delete;
|
||||
CoordinatorState &operator=(CoordinatorState &&) noexcept = delete;
|
||||
|
||||
[[nodiscard]] auto RegisterInstance(CoordinatorClientConfig config) -> RegisterInstanceCoordinatorStatus;
|
||||
[[nodiscard]] auto RegisterReplicationInstance(CoordinatorClientConfig config) -> RegisterInstanceCoordinatorStatus;
|
||||
[[nodiscard]] auto UnregisterReplicationInstance(std::string instance_name) -> UnregisterInstanceCoordinatorStatus;
|
||||
|
||||
[[nodiscard]] auto SetInstanceToMain(std::string instance_name) -> SetInstanceToMainCoordinatorStatus;
|
||||
[[nodiscard]] auto SetReplicationInstanceToMain(std::string instance_name) -> SetInstanceToMainCoordinatorStatus;
|
||||
|
||||
auto ShowInstances() const -> std::vector<InstanceStatus>;
|
||||
|
||||
auto AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_port, std::string raft_address) -> void;
|
||||
|
||||
// The client code must check that the server exists before calling this method.
|
||||
// NOTE: The client code must check that the server exists before calling this method.
|
||||
auto GetCoordinatorServer() const -> CoordinatorServer &;
|
||||
|
||||
private:
|
||||
std::variant<CoordinatorData, CoordinatorMainReplicaData> data_;
|
||||
struct CoordinatorMainReplicaData {
|
||||
std::unique_ptr<CoordinatorServer> coordinator_server_;
|
||||
};
|
||||
|
||||
std::variant<CoordinatorInstance, CoordinatorMainReplicaData> data_;
|
||||
};
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
|
79
src/coordination/include/coordination/raft_state.hpp
Normal file
79
src/coordination/include/coordination/raft_state.hpp
Normal file
@ -0,0 +1,79 @@
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
// License, and you may not use this file except in compliance with the Business Source License.
|
||||
//
|
||||
// As of the Change Date specified in that file, in accordance with
|
||||
// the Business Source License, use of this software will be governed
|
||||
// by the Apache License, Version 2.0, included in the file
|
||||
// licenses/APL.txt.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
|
||||
#include <flags/replication.hpp>
|
||||
|
||||
#include <libnuraft/nuraft.hxx>
|
||||
|
||||
namespace memgraph::coordination {
|
||||
|
||||
using BecomeLeaderCb = std::function<void()>;
|
||||
using BecomeFollowerCb = std::function<void()>;
|
||||
|
||||
using nuraft::buffer;
|
||||
using nuraft::logger;
|
||||
using nuraft::ptr;
|
||||
using nuraft::raft_launcher;
|
||||
using nuraft::raft_server;
|
||||
using nuraft::srv_config;
|
||||
using nuraft::state_machine;
|
||||
using nuraft::state_mgr;
|
||||
using raft_result = nuraft::cmd_result<ptr<buffer>>;
|
||||
|
||||
class RaftState {
|
||||
private:
|
||||
explicit RaftState(BecomeLeaderCb become_leader_cb, BecomeFollowerCb become_follower_cb, uint32_t raft_server_id,
|
||||
uint32_t raft_port, std::string raft_address);
|
||||
|
||||
auto InitRaftServer() -> void;
|
||||
|
||||
public:
|
||||
RaftState() = delete;
|
||||
RaftState(RaftState const &other) = default;
|
||||
RaftState &operator=(RaftState const &other) = default;
|
||||
RaftState(RaftState &&other) noexcept = default;
|
||||
RaftState &operator=(RaftState &&other) noexcept = default;
|
||||
~RaftState();
|
||||
|
||||
static auto MakeRaftState(BecomeLeaderCb become_leader_cb, BecomeFollowerCb become_follower_cb) -> RaftState;
|
||||
|
||||
auto InstanceName() const -> std::string;
|
||||
auto RaftSocketAddress() const -> std::string;
|
||||
|
||||
auto AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_port, std::string raft_address) -> void;
|
||||
auto GetAllCoordinators() const -> std::vector<ptr<srv_config>>;
|
||||
|
||||
auto RequestLeadership() -> bool;
|
||||
auto IsLeader() const -> bool;
|
||||
|
||||
auto AppendRegisterReplicationInstance(std::string const &instance) -> ptr<raft_result>;
|
||||
|
||||
// TODO: (andi) I think variables below can be abstracted
|
||||
uint32_t raft_server_id_;
|
||||
uint32_t raft_port_;
|
||||
std::string raft_address_;
|
||||
|
||||
ptr<state_machine> state_machine_;
|
||||
ptr<state_mgr> state_manager_;
|
||||
ptr<raft_server> raft_server_;
|
||||
ptr<logger> logger_;
|
||||
raft_launcher launcher_;
|
||||
|
||||
BecomeLeaderCb become_leader_cb_;
|
||||
BecomeFollowerCb become_follower_cb_;
|
||||
};
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
#endif
|
@ -22,11 +22,24 @@ enum class RegisterInstanceCoordinatorStatus : uint8_t {
|
||||
ENDPOINT_EXISTS,
|
||||
NOT_COORDINATOR,
|
||||
RPC_FAILED,
|
||||
NOT_LEADER,
|
||||
RAFT_COULD_NOT_ACCEPT,
|
||||
RAFT_COULD_NOT_APPEND,
|
||||
SUCCESS
|
||||
};
|
||||
|
||||
/// Status codes returned by the unregister-replication-instance operations.
/// The explicit values match the implicit numbering of the declaration order.
enum class UnregisterInstanceCoordinatorStatus : uint8_t {
  NO_INSTANCE_WITH_NAME = 0,
  IS_MAIN = 1,
  NOT_COORDINATOR = 2,
  NOT_LEADER = 3,
  RPC_FAILED = 4,
  SUCCESS = 5,
};
|
||||
|
||||
enum class SetInstanceToMainCoordinatorStatus : uint8_t {
|
||||
NO_INSTANCE_WITH_NAME,
|
||||
MAIN_ALREADY_EXISTS,
|
||||
NOT_COORDINATOR,
|
||||
SUCCESS,
|
||||
COULD_NOT_PROMOTE_TO_MAIN,
|
||||
|
@ -14,20 +14,20 @@
|
||||
#ifdef MG_ENTERPRISE
|
||||
|
||||
#include "coordination/coordinator_client.hpp"
|
||||
#include "coordination/coordinator_cluster_config.hpp"
|
||||
#include "coordination/coordinator_exceptions.hpp"
|
||||
#include "replication_coordination_glue/role.hpp"
|
||||
|
||||
#include <libnuraft/nuraft.hxx>
|
||||
#include "utils/result.hpp"
|
||||
#include "utils/uuid.hpp"
|
||||
|
||||
namespace memgraph::coordination {
|
||||
|
||||
class CoordinatorData;
|
||||
class CoordinatorInstance;
|
||||
|
||||
class ReplicationInstance {
|
||||
public:
|
||||
ReplicationInstance(CoordinatorData *data, CoordinatorClientConfig config, HealthCheckCallback succ_cb,
|
||||
ReplicationInstance(CoordinatorInstance *peer, CoordinatorClientConfig config, HealthCheckCallback succ_cb,
|
||||
HealthCheckCallback fail_cb);
|
||||
|
||||
ReplicationInstance(ReplicationInstance const &other) = delete;
|
||||
@ -38,6 +38,9 @@ class ReplicationInstance {
|
||||
|
||||
auto OnSuccessPing() -> void;
|
||||
auto OnFailPing() -> bool;
|
||||
auto IsReadyForUUIDPing() -> bool;
|
||||
|
||||
void UpdateReplicaLastResponseUUID();
|
||||
|
||||
auto IsAlive() const -> bool;
|
||||
|
||||
@ -51,22 +54,34 @@ class ReplicationInstance {
|
||||
HealthCheckCallback main_fail_cb) -> bool;
|
||||
auto DemoteToReplica(HealthCheckCallback replica_succ_cb, HealthCheckCallback replica_fail_cb) -> bool;
|
||||
|
||||
auto StartFrequentCheck() -> void;
|
||||
auto StopFrequentCheck() -> void;
|
||||
auto PauseFrequentCheck() -> void;
|
||||
auto ResumeFrequentCheck() -> void;
|
||||
|
||||
auto ReplicationClientInfo() const -> ReplClientInfo;
|
||||
|
||||
auto SendSwapAndUpdateUUID(const utils::UUID &main_uuid) -> bool;
|
||||
auto EnsureReplicaHasCorrectMainUUID(utils::UUID const &curr_main_uuid) -> bool;
|
||||
|
||||
auto SendSwapAndUpdateUUID(const utils::UUID &new_main_uuid) -> bool;
|
||||
auto SendUnregisterReplicaRpc(std::string const &instance_name) -> bool;
|
||||
|
||||
|
||||
auto SendGetInstanceUUID() -> utils::BasicResult<coordination::GetInstanceUUIDError, std::optional<utils::UUID>>;
|
||||
auto GetClient() -> CoordinatorClient &;
|
||||
|
||||
void SetNewMainUUID(const std::optional<utils::UUID> &main_uuid = std::nullopt);
|
||||
auto GetMainUUID() -> const std::optional<utils::UUID> &;
|
||||
auto EnableWritingOnMain() -> bool;
|
||||
|
||||
auto SetNewMainUUID(utils::UUID const &main_uuid) -> void;
|
||||
auto ResetMainUUID() -> void;
|
||||
auto GetMainUUID() const -> const std::optional<utils::UUID> &;
|
||||
|
||||
private:
|
||||
CoordinatorClient client_;
|
||||
replication_coordination_glue::ReplicationRole replication_role_;
|
||||
std::chrono::system_clock::time_point last_response_time_{};
|
||||
bool is_alive_{false};
|
||||
std::chrono::system_clock::time_point last_check_of_uuid_{};
|
||||
|
||||
// for replica this is main uuid of current main
|
||||
// for "main" main this same as in CoordinatorData
|
||||
|
@ -9,14 +9,6 @@
|
||||
// by the Apache License, Version 2.0, included in the file
|
||||
// licenses/APL.txt.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
namespace memgraph::coordination {
|
||||
|
||||
struct CoordinatorClusterConfig {
|
||||
static constexpr int alive_response_time_difference_sec_{5};
|
||||
};
|
||||
|
||||
enum class GetInstanceUUIDError { NO_RESPONSE, RPC_EXCEPTION };
|
||||
} // namespace memgraph::coordination
|
||||
#endif
|
@ -46,9 +46,6 @@ class CoordinatorLogStore : public log_store {
|
||||
|
||||
ptr<std::vector<ptr<log_entry>>> log_entries(ulong start, ulong end) override;
|
||||
|
||||
// NOLINTNEXTLINE
|
||||
ptr<std::vector<ptr<log_entry>>> log_entries_ext(ulong start, ulong end, int64 batch_size_hint_in_bytes = 0) override;
|
||||
|
||||
ptr<log_entry> entry_at(ulong index) override;
|
||||
|
||||
ulong term_at(ulong index) override;
|
||||
@ -61,67 +58,12 @@ class CoordinatorLogStore : public log_store {
|
||||
|
||||
bool flush() override;
|
||||
|
||||
ulong last_durable_index() override;
|
||||
|
||||
void Close();
|
||||
|
||||
void SetDiskDelay(raft_server *raft, size_t delay_ms);
|
||||
|
||||
private:
|
||||
static ptr<log_entry> MakeClone(ptr<log_entry> const &entry);
|
||||
auto FindOrDefault_(ulong index) const -> ptr<log_entry>;
|
||||
|
||||
void DiskEmulLoop();
|
||||
|
||||
/**
|
||||
* Map of <log index, log data>.
|
||||
*/
|
||||
std::map<ulong, ptr<log_entry>> logs_;
|
||||
|
||||
/**
|
||||
* Lock for `logs_`.
|
||||
*/
|
||||
mutable std::mutex logs_lock_;
|
||||
|
||||
/**
|
||||
* The index of the first log.
|
||||
*/
|
||||
std::atomic<ulong> start_idx_;
|
||||
|
||||
/**
|
||||
* Backward pointer to Raft server.
|
||||
*/
|
||||
raft_server *raft_server_bwd_pointer_;
|
||||
|
||||
// Testing purpose --------------- BEGIN
|
||||
|
||||
/**
|
||||
* If non-zero, this log store will emulate the disk write delay.
|
||||
*/
|
||||
std::atomic<size_t> disk_emul_delay;
|
||||
|
||||
/**
|
||||
* Map of <timestamp, log index>, emulating logs that is being written to disk.
|
||||
* Log index will be regarded as "durable" after the corresponding timestamp.
|
||||
*/
|
||||
std::map<uint64_t, uint64_t> disk_emul_logs_being_written_;
|
||||
|
||||
/**
|
||||
* Thread that will update `last_durable_index_` and call
|
||||
* `notify_log_append_completion` at proper time.
|
||||
*/
|
||||
std::unique_ptr<std::thread> disk_emul_thread_;
|
||||
|
||||
/**
|
||||
* Flag to terminate the thread.
|
||||
*/
|
||||
std::atomic<bool> disk_emul_thread_stop_signal_;
|
||||
|
||||
/**
|
||||
* Last written log index.
|
||||
*/
|
||||
std::atomic<uint64_t> disk_emul_last_durable_index_;
|
||||
|
||||
// Testing purpose --------------- END
|
||||
};
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
|
@ -36,6 +36,10 @@ class CoordinatorStateMachine : public state_machine {
|
||||
CoordinatorStateMachine &operator=(CoordinatorStateMachine &&) = delete;
|
||||
~CoordinatorStateMachine() override {}
|
||||
|
||||
static auto EncodeRegisterReplicationInstance(const std::string &name) -> ptr<buffer>;
|
||||
|
||||
static auto DecodeRegisterReplicationInstance(buffer &data) -> std::string;
|
||||
|
||||
auto pre_commit(ulong log_idx, buffer &data) -> ptr<buffer> override;
|
||||
|
||||
auto commit(ulong log_idx, buffer &data) -> ptr<buffer> override;
|
||||
|
140
src/coordination/raft_state.cpp
Normal file
140
src/coordination/raft_state.cpp
Normal file
@ -0,0 +1,140 @@
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
// License, and you may not use this file except in compliance with the Business Source License.
|
||||
//
|
||||
// As of the Change Date specified in that file, in accordance with
|
||||
// the Business Source License, use of this software will be governed
|
||||
// by the Apache License, Version 2.0, included in the file
|
||||
// licenses/APL.txt.
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
|
||||
#include "coordination/raft_state.hpp"
|
||||
|
||||
#include "coordination/coordinator_exceptions.hpp"
|
||||
#include "nuraft/coordinator_state_machine.hpp"
|
||||
#include "nuraft/coordinator_state_manager.hpp"
|
||||
#include "utils/counter.hpp"
|
||||
|
||||
namespace memgraph::coordination {
|
||||
|
||||
using nuraft::asio_service;
|
||||
using nuraft::cb_func;
|
||||
using nuraft::CbReturnCode;
|
||||
using nuraft::cmd_result;
|
||||
using nuraft::cs_new;
|
||||
using nuraft::ptr;
|
||||
using nuraft::raft_params;
|
||||
using nuraft::raft_server;
|
||||
using nuraft::srv_config;
|
||||
using raft_result = cmd_result<ptr<buffer>>;
|
||||
|
||||
RaftState::RaftState(BecomeLeaderCb become_leader_cb, BecomeFollowerCb become_follower_cb, uint32_t raft_server_id,
|
||||
uint32_t raft_port, std::string raft_address)
|
||||
: raft_server_id_(raft_server_id),
|
||||
raft_port_(raft_port),
|
||||
raft_address_(std::move(raft_address)),
|
||||
state_machine_(cs_new<CoordinatorStateMachine>()),
|
||||
state_manager_(
|
||||
cs_new<CoordinatorStateManager>(raft_server_id_, raft_address_ + ":" + std::to_string(raft_port_))),
|
||||
logger_(nullptr),
|
||||
become_leader_cb_(std::move(become_leader_cb)),
|
||||
become_follower_cb_(std::move(become_follower_cb)) {}
|
||||
|
||||
auto RaftState::InitRaftServer() -> void {
|
||||
asio_service::options asio_opts;
|
||||
asio_opts.thread_pool_size_ = 1; // TODO: (andi) Improve this
|
||||
|
||||
raft_params params;
|
||||
params.heart_beat_interval_ = 100;
|
||||
params.election_timeout_lower_bound_ = 200;
|
||||
params.election_timeout_upper_bound_ = 400;
|
||||
// 5 logs are preserved before the last snapshot
|
||||
params.reserved_log_items_ = 5;
|
||||
// Create snapshot for every 5 log appends
|
||||
params.snapshot_distance_ = 5;
|
||||
params.client_req_timeout_ = 3000;
|
||||
params.return_method_ = raft_params::blocking;
|
||||
|
||||
raft_server::init_options init_opts;
|
||||
init_opts.raft_callback_ = [this](cb_func::Type event_type, cb_func::Param *param) -> nuraft::CbReturnCode {
|
||||
if (event_type == cb_func::BecomeLeader) {
|
||||
spdlog::info("Node {} became leader", param->leaderId);
|
||||
become_leader_cb_();
|
||||
} else if (event_type == cb_func::BecomeFollower) {
|
||||
spdlog::info("Node {} became follower", param->myId);
|
||||
become_follower_cb_();
|
||||
}
|
||||
return CbReturnCode::Ok;
|
||||
};
|
||||
|
||||
raft_launcher launcher;
|
||||
|
||||
raft_server_ = launcher.init(state_machine_, state_manager_, logger_, static_cast<int>(raft_port_), asio_opts, params,
|
||||
init_opts);
|
||||
|
||||
if (!raft_server_) {
|
||||
throw RaftServerStartException("Failed to launch raft server on {}:{}", raft_address_, raft_port_);
|
||||
}
|
||||
|
||||
auto maybe_stop = utils::ResettableCounter<20>();
|
||||
do {
|
||||
if (raft_server_->is_initialized()) {
|
||||
return;
|
||||
}
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(250));
|
||||
} while (!maybe_stop());
|
||||
|
||||
throw RaftServerStartException("Failed to initialize raft server on {}:{}", raft_address_, raft_port_);
|
||||
}
|
||||
|
||||
/// Factory: reads the raft server id and port from the command-line flags, constructs a RaftState bound to
/// the hard-coded address "127.0.0.1" and starts its raft server.
///
/// @param become_leader_cb   forwarded to the RaftState constructor
/// @param become_follower_cb forwarded to the RaftState constructor
/// @return the initialized RaftState
/// @throws RaftCouldNotParseFlagsException if reading the flags throws a std::exception
auto RaftState::MakeRaftState(BecomeLeaderCb become_leader_cb, BecomeFollowerCb become_follower_cb) -> RaftState {
  uint32_t server_id = 0;
  uint32_t port = 0;
  try {
    server_id = FLAGS_raft_server_id;
    port = FLAGS_raft_server_port;
  } catch (std::exception const &e) {
    throw RaftCouldNotParseFlagsException("Failed to parse flags: {}", e.what());
  }

  RaftState state(std::move(become_leader_cb), std::move(become_follower_cb), server_id, port, "127.0.0.1");
  state.InitRaftServer();
  return state;
}
|
||||
|
||||
/// Asks the member launcher to shut down on destruction.
RaftState::~RaftState() {
  launcher_.shutdown();
}
|
||||
|
||||
/// @return the name of this coordinator: "coordinator_" followed by its raft server id.
auto RaftState::InstanceName() const -> std::string {
  std::string name{"coordinator_"};
  name += std::to_string(raft_server_id_);
  return name;
}
|
||||
|
||||
auto RaftState::RaftSocketAddress() const -> std::string { return raft_address_ + ":" + std::to_string(raft_port_); }
|
||||
|
||||
auto RaftState::AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_port, std::string raft_address) -> void {
|
||||
auto const endpoint = raft_address + ":" + std::to_string(raft_port);
|
||||
srv_config const srv_config_to_add(static_cast<int>(raft_server_id), endpoint);
|
||||
if (!raft_server_->add_srv(srv_config_to_add)->get_accepted()) {
|
||||
throw RaftAddServerException("Failed to add server {} to the cluster", endpoint);
|
||||
}
|
||||
spdlog::info("Request to add server {} to the cluster accepted", endpoint);
|
||||
}
|
||||
|
||||
/// @return the server configurations the raft server reports via get_srv_config_all().
auto RaftState::GetAllCoordinators() const -> std::vector<ptr<srv_config>> {
  std::vector<ptr<srv_config>> coordinators{};
  // The raft server fills the vector passed to it.
  raft_server_->get_srv_config_all(coordinators);
  return coordinators;
}
|
||||
|
||||
/// @return whether the raft server reports itself as the leader.
auto RaftState::IsLeader() const -> bool {
  bool const leader = raft_server_->is_leader();
  return leader;
}
|
||||
|
||||
/// Requests leadership unless this server is already the leader.
///
/// @return true if already leader; otherwise the result of the raft server's request_leadership().
auto RaftState::RequestLeadership() -> bool {
  if (raft_server_->is_leader()) {
    // Already leader, nothing to request.
    return true;
  }
  return raft_server_->request_leadership();
}
|
||||
|
||||
/// Encodes a "register replication instance" log entry for `instance` and appends it through the raft server.
///
/// @param instance name of the replication instance to encode into the log entry
/// @return the result handle returned by the raft server's append_entries()
auto RaftState::AppendRegisterReplicationInstance(std::string const &instance) -> ptr<raft_result> {
  return raft_server_->append_entries({CoordinatorStateMachine::EncodeRegisterReplicationInstance(instance)});
}
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
#endif
|
@ -14,17 +14,18 @@
|
||||
#include "coordination/replication_instance.hpp"
|
||||
|
||||
#include "replication_coordination_glue/handler.hpp"
|
||||
#include "utils/result.hpp"
|
||||
|
||||
namespace memgraph::coordination {
|
||||
|
||||
ReplicationInstance::ReplicationInstance(CoordinatorData *data, CoordinatorClientConfig config,
|
||||
ReplicationInstance::ReplicationInstance(CoordinatorInstance *peer, CoordinatorClientConfig config,
|
||||
HealthCheckCallback succ_cb, HealthCheckCallback fail_cb)
|
||||
: client_(data, std::move(config), std::move(succ_cb), std::move(fail_cb)),
|
||||
replication_role_(replication_coordination_glue::ReplicationRole::REPLICA),
|
||||
is_alive_(true) {
|
||||
: client_(peer, std::move(config), std::move(succ_cb), std::move(fail_cb)),
|
||||
replication_role_(replication_coordination_glue::ReplicationRole::REPLICA) {
|
||||
if (!client_.DemoteToReplica()) {
|
||||
throw CoordinatorRegisterInstanceException("Failed to demote instance {} to replica", client_.InstanceName());
|
||||
}
|
||||
|
||||
client_.StartFrequentCheck();
|
||||
}
|
||||
|
||||
@ -34,12 +35,16 @@ auto ReplicationInstance::OnSuccessPing() -> void {
|
||||
}
|
||||
|
||||
auto ReplicationInstance::OnFailPing() -> bool {
|
||||
is_alive_ =
|
||||
std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now() - last_response_time_).count() <
|
||||
CoordinatorClusterConfig::alive_response_time_difference_sec_;
|
||||
auto elapsed_time = std::chrono::system_clock::now() - last_response_time_;
|
||||
is_alive_ = elapsed_time < client_.InstanceDownTimeoutSec();
|
||||
return is_alive_;
|
||||
}
|
||||
|
||||
auto ReplicationInstance::IsReadyForUUIDPing() -> bool {
|
||||
return std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now() - last_check_of_uuid_) >
|
||||
client_.InstanceGetUUIDFrequencySec();
|
||||
}
|
||||
|
||||
auto ReplicationInstance::InstanceName() const -> std::string { return client_.InstanceName(); }
|
||||
auto ReplicationInstance::SocketAddress() const -> std::string { return client_.SocketAddress(); }
|
||||
auto ReplicationInstance::IsAlive() const -> bool { return is_alive_; }
|
||||
@ -51,13 +56,14 @@ auto ReplicationInstance::IsMain() const -> bool {
|
||||
return replication_role_ == replication_coordination_glue::ReplicationRole::MAIN;
|
||||
}
|
||||
|
||||
auto ReplicationInstance::PromoteToMain(utils::UUID uuid, ReplicationClientsInfo repl_clients_info,
|
||||
auto ReplicationInstance::PromoteToMain(utils::UUID new_uuid, ReplicationClientsInfo repl_clients_info,
|
||||
HealthCheckCallback main_succ_cb, HealthCheckCallback main_fail_cb) -> bool {
|
||||
if (!client_.SendPromoteReplicaToMainRpc(uuid, std::move(repl_clients_info))) {
|
||||
if (!client_.SendPromoteReplicaToMainRpc(new_uuid, std::move(repl_clients_info))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
replication_role_ = replication_coordination_glue::ReplicationRole::MAIN;
|
||||
main_uuid_ = new_uuid;
|
||||
client_.SetCallbacks(std::move(main_succ_cb), std::move(main_fail_cb));
|
||||
|
||||
return true;
|
||||
@ -75,6 +81,8 @@ auto ReplicationInstance::DemoteToReplica(HealthCheckCallback replica_succ_cb, H
|
||||
return true;
|
||||
}
|
||||
|
||||
auto ReplicationInstance::StartFrequentCheck() -> void { client_.StartFrequentCheck(); }
|
||||
auto ReplicationInstance::StopFrequentCheck() -> void { client_.StopFrequentCheck(); }
|
||||
auto ReplicationInstance::PauseFrequentCheck() -> void { client_.PauseFrequentCheck(); }
|
||||
auto ReplicationInstance::ResumeFrequentCheck() -> void { client_.ResumeFrequentCheck(); }
|
||||
|
||||
@ -83,16 +91,48 @@ auto ReplicationInstance::ReplicationClientInfo() const -> CoordinatorClientConf
|
||||
}
|
||||
|
||||
auto ReplicationInstance::GetClient() -> CoordinatorClient & { return client_; }
|
||||
void ReplicationInstance::SetNewMainUUID(const std::optional<utils::UUID> &main_uuid) { main_uuid_ = main_uuid; }
|
||||
auto ReplicationInstance::GetMainUUID() -> const std::optional<utils::UUID> & { return main_uuid_; }
|
||||
|
||||
auto ReplicationInstance::SendSwapAndUpdateUUID(const utils::UUID &main_uuid) -> bool {
|
||||
if (!replication_coordination_glue::SendSwapMainUUIDRpc(client_.RpcClient(), main_uuid)) {
|
||||
auto ReplicationInstance::SetNewMainUUID(utils::UUID const &main_uuid) -> void { main_uuid_ = main_uuid; }
|
||||
auto ReplicationInstance::ResetMainUUID() -> void { main_uuid_ = std::nullopt; }
|
||||
auto ReplicationInstance::GetMainUUID() const -> std::optional<utils::UUID> const & { return main_uuid_; }
|
||||
|
||||
auto ReplicationInstance::EnsureReplicaHasCorrectMainUUID(utils::UUID const &curr_main_uuid) -> bool {
|
||||
if (!IsReadyForUUIDPing()) {
|
||||
return true;
|
||||
}
|
||||
auto res = SendGetInstanceUUID();
|
||||
if (res.HasError()) {
|
||||
return false;
|
||||
}
|
||||
SetNewMainUUID(main_uuid_);
|
||||
UpdateReplicaLastResponseUUID();
|
||||
|
||||
if (res.GetValue().has_value() && res.GetValue().value() == curr_main_uuid) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return SendSwapAndUpdateUUID(curr_main_uuid);
|
||||
}
|
||||
|
||||
auto ReplicationInstance::SendSwapAndUpdateUUID(const utils::UUID &new_main_uuid) -> bool {
|
||||
if (!replication_coordination_glue::SendSwapMainUUIDRpc(client_.RpcClient(), new_main_uuid)) {
|
||||
return false;
|
||||
}
|
||||
SetNewMainUUID(new_main_uuid);
|
||||
return true;
|
||||
}
|
||||
|
||||
auto ReplicationInstance::SendUnregisterReplicaRpc(std::string const &instance_name) -> bool {
|
||||
return client_.SendUnregisterReplicaRpc(instance_name);
|
||||
}
|
||||
|
||||
auto ReplicationInstance::EnableWritingOnMain() -> bool { return client_.SendEnableWritingOnMainRpc(); }
|
||||
|
||||
auto ReplicationInstance::SendGetInstanceUUID()
|
||||
-> utils::BasicResult<coordination::GetInstanceUUIDError, std::optional<utils::UUID>> {
|
||||
return client_.SendGetInstanceUUIDRpc();
|
||||
}
|
||||
|
||||
void ReplicationInstance::UpdateReplicaLastResponseUUID() { last_check_of_uuid_ = std::chrono::system_clock::now(); }
|
||||
|
||||
} // namespace memgraph::coordination
|
||||
#endif
|
||||
|
@ -16,10 +16,4 @@ namespace memgraph::dbms {
|
||||
constexpr std::string_view kDefaultDB = "memgraph"; //!< Name of the default database
|
||||
constexpr std::string_view kMultiTenantDir = "databases"; //!< Name of the multi-tenant directory
|
||||
|
||||
#ifdef MG_EXPERIMENTAL_REPLICATION_MULTITENANCY
|
||||
constexpr bool allow_mt_repl = true;
|
||||
#else
|
||||
constexpr bool allow_mt_repl = false;
|
||||
#endif
|
||||
|
||||
} // namespace memgraph::dbms
|
||||
|
@ -20,14 +20,19 @@ namespace memgraph::dbms {
|
||||
CoordinatorHandler::CoordinatorHandler(coordination::CoordinatorState &coordinator_state)
|
||||
: coordinator_state_(coordinator_state) {}
|
||||
|
||||
auto CoordinatorHandler::RegisterInstance(memgraph::coordination::CoordinatorClientConfig config)
|
||||
auto CoordinatorHandler::RegisterReplicationInstance(memgraph::coordination::CoordinatorClientConfig config)
|
||||
-> coordination::RegisterInstanceCoordinatorStatus {
|
||||
return coordinator_state_.RegisterInstance(config);
|
||||
return coordinator_state_.RegisterReplicationInstance(config);
|
||||
}
|
||||
|
||||
auto CoordinatorHandler::SetInstanceToMain(std::string instance_name)
|
||||
auto CoordinatorHandler::UnregisterReplicationInstance(std::string instance_name)
|
||||
-> coordination::UnregisterInstanceCoordinatorStatus {
|
||||
return coordinator_state_.UnregisterReplicationInstance(std::move(instance_name));
|
||||
}
|
||||
|
||||
auto CoordinatorHandler::SetReplicationInstanceToMain(std::string instance_name)
|
||||
-> coordination::SetInstanceToMainCoordinatorStatus {
|
||||
return coordinator_state_.SetInstanceToMain(std::move(instance_name));
|
||||
return coordinator_state_.SetReplicationInstanceToMain(std::move(instance_name));
|
||||
}
|
||||
|
||||
auto CoordinatorHandler::ShowInstances() const -> std::vector<coordination::InstanceStatus> {
|
||||
|
@ -28,10 +28,14 @@ class CoordinatorHandler {
|
||||
public:
|
||||
explicit CoordinatorHandler(coordination::CoordinatorState &coordinator_state);
|
||||
|
||||
auto RegisterInstance(coordination::CoordinatorClientConfig config)
|
||||
// TODO: (andi) When moving coordinator state on same instances, rename from RegisterReplicationInstance to
|
||||
// RegisterInstance
|
||||
auto RegisterReplicationInstance(coordination::CoordinatorClientConfig config)
|
||||
-> coordination::RegisterInstanceCoordinatorStatus;
|
||||
|
||||
auto SetInstanceToMain(std::string instance_name) -> coordination::SetInstanceToMainCoordinatorStatus;
|
||||
auto UnregisterReplicationInstance(std::string instance_name) -> coordination::UnregisterInstanceCoordinatorStatus;
|
||||
|
||||
auto SetReplicationInstanceToMain(std::string instance_name) -> coordination::SetInstanceToMainCoordinatorStatus;
|
||||
|
||||
auto ShowInstances() const -> std::vector<coordination::InstanceStatus>;
|
||||
|
||||
|
@ -16,6 +16,7 @@
|
||||
|
||||
#include "dbms/constants.hpp"
|
||||
#include "dbms/global.hpp"
|
||||
#include "flags/experimental.hpp"
|
||||
#include "spdlog/spdlog.h"
|
||||
#include "system/include/system/system.hpp"
|
||||
#include "utils/exceptions.hpp"
|
||||
@ -158,9 +159,9 @@ struct Durability {
|
||||
}
|
||||
};
|
||||
|
||||
DbmsHandler::DbmsHandler(storage::Config config, memgraph::system::System &system,
|
||||
replication::ReplicationState &repl_state, auth::SynchedAuth &auth, bool recovery_on_startup)
|
||||
: default_config_{std::move(config)}, auth_{auth}, repl_state_{repl_state}, system_{&system} {
|
||||
DbmsHandler::DbmsHandler(storage::Config config, replication::ReplicationState &repl_state, auth::SynchedAuth &auth,
|
||||
bool recovery_on_startup)
|
||||
: default_config_{std::move(config)}, auth_{auth}, repl_state_{repl_state} {
|
||||
// TODO: Decouple storage config from dbms config
|
||||
// TODO: Save individual db configs inside the kvstore and restore from there
|
||||
|
||||
@ -419,7 +420,10 @@ void DbmsHandler::UpdateDurability(const storage::Config &config, std::optional<
|
||||
#endif
|
||||
|
||||
void DbmsHandler::RecoverStorageReplication(DatabaseAccess db_acc, replication::RoleMainData &role_main_data) {
|
||||
if (allow_mt_repl || db_acc->name() == dbms::kDefaultDB) {
|
||||
using enum memgraph::flags::Experiments;
|
||||
auto const is_enterprise = license::global_license_checker.IsEnterpriseValidFast();
|
||||
auto experimental_system_replication = flags::AreExperimentsEnabled(SYSTEM_REPLICATION);
|
||||
if ((is_enterprise && experimental_system_replication) || db_acc->name() == dbms::kDefaultDB) {
|
||||
// Handle global replication state
|
||||
spdlog::info("Replication configuration will be stored and will be automatically restored in case of a crash.");
|
||||
// RECOVER REPLICA CONNECTIONS
|
||||
|
@ -107,8 +107,7 @@ class DbmsHandler {
|
||||
* @param auth pointer to the global authenticator
|
||||
* @param recovery_on_startup restore databases (and its content) and authentication data
|
||||
*/
|
||||
DbmsHandler(storage::Config config, memgraph::system::System &system, replication::ReplicationState &repl_state,
|
||||
auth::SynchedAuth &auth,
|
||||
DbmsHandler(storage::Config config, replication::ReplicationState &repl_state, auth::SynchedAuth &auth,
|
||||
bool recovery_on_startup); // TODO If more arguments are added use a config struct
|
||||
#else
|
||||
/**
|
||||
@ -116,9 +115,8 @@ class DbmsHandler {
|
||||
*
|
||||
* @param configs storage configuration
|
||||
*/
|
||||
DbmsHandler(storage::Config config, memgraph::system::System &system, replication::ReplicationState &repl_state)
|
||||
DbmsHandler(storage::Config config, replication::ReplicationState &repl_state)
|
||||
: repl_state_{repl_state},
|
||||
system_{&system},
|
||||
db_gatekeeper_{[&] {
|
||||
config.salient.name = kDefaultDB;
|
||||
return std::move(config);
|
||||
@ -272,6 +270,20 @@ class DbmsHandler {
|
||||
// coordination::CoordinatorState &CoordinatorState() { return coordinator_state_; }
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @brief Return the number of databases (always 1 when MG_ENTERPRISE is not defined).
|
||||
*
|
||||
* @return std::size_t
|
||||
*/
|
||||
auto Count() const -> std::size_t {
|
||||
#ifdef MG_ENTERPRISE
|
||||
std::shared_lock<LockT> rd(lock_);
|
||||
return db_handler_.size();
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Return the statistics of all databases.
|
||||
*
|
||||
@ -587,9 +599,6 @@ class DbmsHandler {
|
||||
// current replication role. TODO: make Database Access explicit about the role and remove this from
|
||||
// dbms stuff
|
||||
replication::ReplicationState &repl_state_; //!< Ref to global replication state
|
||||
public:
|
||||
// TODO fix to be non public/remove from dbms....maybe
|
||||
system::System *system_;
|
||||
|
||||
#ifndef MG_ENTERPRISE
|
||||
mutable utils::Gatekeeper<Database> db_gatekeeper_; //!< Single databases gatekeeper
|
||||
|
@ -144,6 +144,8 @@ class Handler {
|
||||
auto cbegin() const { return items_.cbegin(); }
|
||||
auto cend() const { return items_.cend(); }
|
||||
|
||||
auto size() const { return items_.size(); }
|
||||
|
||||
struct string_hash {
|
||||
using is_transparent = void;
|
||||
[[nodiscard]] size_t operator()(const char *s) const { return std::hash<std::string_view>{}(s); }
|
||||
|
@ -155,6 +155,12 @@ void InMemoryReplicationHandlers::HeartbeatHandler(dbms::DbmsHandler *dbms_handl
|
||||
return;
|
||||
}
|
||||
// TODO: this handler is agnostic of InMemory, move to be reused by on-disk
|
||||
if (!db_acc.has_value()) {
|
||||
spdlog::warn("No database accessor");
|
||||
storage::replication::HeartbeatRes res{false, 0, ""};
|
||||
slk::Save(res, res_builder);
|
||||
return;
|
||||
}
|
||||
auto const *storage = db_acc->get()->storage();
|
||||
storage::replication::HeartbeatRes res{true, storage->repl_storage_state_.last_commit_timestamp_.load(),
|
||||
std::string{storage->repl_storage_state_.epoch_.id()}};
|
||||
@ -463,7 +469,6 @@ void InMemoryReplicationHandlers::TimestampHandler(dbms::DbmsHandler *dbms_handl
|
||||
slk::Save(res, res_builder);
|
||||
}
|
||||
|
||||
/////// AF how does this work, does it get all deltas at once or what?
|
||||
uint64_t InMemoryReplicationHandlers::ReadAndApplyDelta(storage::InMemoryStorage *storage,
|
||||
storage::durability::BaseDecoder *decoder,
|
||||
const uint64_t version) {
|
||||
|
@ -57,6 +57,7 @@ namespace slk {
|
||||
// Serialize code for CreateDatabaseReq
|
||||
|
||||
void Save(const memgraph::storage::replication::CreateDatabaseReq &self, memgraph::slk::Builder *builder) {
|
||||
memgraph::slk::Save(self.main_uuid, builder);
|
||||
memgraph::slk::Save(self.epoch_id, builder);
|
||||
memgraph::slk::Save(self.expected_group_timestamp, builder);
|
||||
memgraph::slk::Save(self.new_group_timestamp, builder);
|
||||
@ -64,6 +65,7 @@ void Save(const memgraph::storage::replication::CreateDatabaseReq &self, memgrap
|
||||
}
|
||||
|
||||
void Load(memgraph::storage::replication::CreateDatabaseReq *self, memgraph::slk::Reader *reader) {
|
||||
memgraph::slk::Load(&self->main_uuid, reader);
|
||||
memgraph::slk::Load(&self->epoch_id, reader);
|
||||
memgraph::slk::Load(&self->expected_group_timestamp, reader);
|
||||
memgraph::slk::Load(&self->new_group_timestamp, reader);
|
||||
@ -87,6 +89,7 @@ void Load(memgraph::storage::replication::CreateDatabaseRes *self, memgraph::slk
|
||||
// Serialize code for DropDatabaseReq
|
||||
|
||||
void Save(const memgraph::storage::replication::DropDatabaseReq &self, memgraph::slk::Builder *builder) {
|
||||
memgraph::slk::Save(self.main_uuid, builder);
|
||||
memgraph::slk::Save(self.epoch_id, builder);
|
||||
memgraph::slk::Save(self.expected_group_timestamp, builder);
|
||||
memgraph::slk::Save(self.new_group_timestamp, builder);
|
||||
@ -94,6 +97,7 @@ void Save(const memgraph::storage::replication::DropDatabaseReq &self, memgraph:
|
||||
}
|
||||
|
||||
void Load(memgraph::storage::replication::DropDatabaseReq *self, memgraph::slk::Reader *reader) {
|
||||
memgraph::slk::Load(&self->main_uuid, reader);
|
||||
memgraph::slk::Load(&self->epoch_id, reader);
|
||||
memgraph::slk::Load(&self->expected_group_timestamp, reader);
|
||||
memgraph::slk::Load(&self->new_group_timestamp, reader);
|
||||
|
@ -1,12 +1,17 @@
|
||||
add_library(mg-flags STATIC audit.cpp
|
||||
bolt.cpp
|
||||
general.cpp
|
||||
isolation_level.cpp
|
||||
log_level.cpp
|
||||
memory_limit.cpp
|
||||
run_time_configurable.cpp
|
||||
storage_mode.cpp
|
||||
query.cpp
|
||||
replication.cpp)
|
||||
target_include_directories(mg-flags PUBLIC ${CMAKE_SOURCE_DIR}/include)
|
||||
target_link_libraries(mg-flags PUBLIC spdlog::spdlog mg-settings mg-utils)
|
||||
add_library(mg-flags STATIC
|
||||
audit.cpp
|
||||
bolt.cpp
|
||||
general.cpp
|
||||
isolation_level.cpp
|
||||
log_level.cpp
|
||||
memory_limit.cpp
|
||||
run_time_configurable.cpp
|
||||
storage_mode.cpp
|
||||
query.cpp
|
||||
replication.cpp
|
||||
experimental.cpp
|
||||
experimental.hpp)
|
||||
target_include_directories(mg-flags PUBLIC include)
|
||||
target_link_libraries(mg-flags
|
||||
PUBLIC spdlog::spdlog mg-settings mg-utils
|
||||
PRIVATE lib::rangev3)
|
||||
|
@ -12,6 +12,7 @@
|
||||
|
||||
#include "flags/audit.hpp"
|
||||
#include "flags/bolt.hpp"
|
||||
#include "flags/experimental.hpp"
|
||||
#include "flags/general.hpp"
|
||||
#include "flags/isolation_level.hpp"
|
||||
#include "flags/log_level.hpp"
|
||||
|
67
src/flags/experimental.cpp
Normal file
67
src/flags/experimental.cpp
Normal file
@ -0,0 +1,67 @@
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
// License, and you may not use this file except in compliance with the Business Source License.
|
||||
//
|
||||
// As of the Change Date specified in that file, in accordance with
|
||||
// the Business Source License, use of this software will be governed
|
||||
// by the Apache License, Version 2.0, included in the file
|
||||
// licenses/APL.txt.
|
||||
|
||||
#include "flags/experimental.hpp"
|
||||
#include "range/v3/all.hpp"
|
||||
#include "utils/string.hpp"
|
||||
|
||||
#include <map>
|
||||
#include <string_view>
|
||||
|
||||
// Experimental feature flags.
|
||||
// NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables)
|
||||
DEFINE_string(experimental_enabled, "",
|
||||
"Experimental features to be used, comma seperated. Options [system-replication]");
|
||||
|
||||
using namespace std::string_view_literals;
|
||||
|
||||
namespace memgraph::flags {
|
||||
|
||||
// Lookup table from the name accepted in --experimental_enabled to the experiment it enables.
auto const mapping = std::map{std::pair{"system-replication"sv, Experiments::SYSTEM_REPLICATION}};
|
||||
|
||||
/// @return a mutable reference to the single function-local, value-initialized Experiments object.
auto ExperimentsInstance() -> Experiments & {
  static Experiments enabled_experiments{};
  return enabled_experiments;
}
|
||||
|
||||
/// Checks whether every bit set in `experiments` is also set in the currently stored experiments.
///
/// @param experiments bit set of experiments to test for
/// @return true only if all requested bits are set
bool AreExperimentsEnabled(Experiments experiments) {
  using underlying_type = std::underlying_type_t<Experiments>;

  auto const requested = static_cast<underlying_type>(experiments);
  auto const enabled = static_cast<underlying_type>(ExperimentsInstance());

  // All requested bits must survive the mask.
  return (enabled & requested) == requested;
}
|
||||
|
||||
void InitializeExperimental() {
|
||||
namespace rv = ranges::views;
|
||||
|
||||
auto const connonicalize_string = [](auto &&rng) {
|
||||
auto const is_space = [](auto c) { return c == ' '; };
|
||||
auto const to_lower = [](unsigned char c) { return std::tolower(c); };
|
||||
|
||||
return rng | rv::drop_while(is_space) | rv::take_while(std::not_fn(is_space)) | rv::transform(to_lower) |
|
||||
ranges::to<std::string>;
|
||||
};
|
||||
|
||||
auto const mapping_end = mapping.cend();
|
||||
using underlying_type = std::underlying_type_t<Experiments>;
|
||||
auto to_set = underlying_type{};
|
||||
for (auto &&experiment : FLAGS_experimental_enabled | rv::split(',') | rv::transform(connonicalize_string)) {
|
||||
if (auto it = mapping.find(experiment); it != mapping_end) {
|
||||
to_set |= static_cast<underlying_type>(it->second);
|
||||
}
|
||||
}
|
||||
|
||||
ExperimentsInstance() = static_cast<Experiments>(to_set);
|
||||
}
|
||||
|
||||
} // namespace memgraph::flags
|
32
src/flags/experimental.hpp
Normal file
32
src/flags/experimental.hpp
Normal file
@ -0,0 +1,32 @@
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
// License, and you may not use this file except in compliance with the Business Source License.
|
||||
//
|
||||
// As of the Change Date specified in that file, in accordance with
|
||||
// the Business Source License, use of this software will be governed
|
||||
// by the Apache License, Version 2.0, included in the file
|
||||
// licenses/APL.txt.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "gflags/gflags.h"
|
||||
|
||||
// Comma separated list of experimental features to enable.
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
DECLARE_string(experimental_enabled);
|
||||
|
||||
namespace memgraph::flags {
|
||||
|
||||
// Bit-flag set of experimental features: every enumerator occupies its own bit.
// A bit may be reused for a new experiment once the code of a retired one has been cleaned up.
enum class Experiments : uint8_t {
  SYSTEM_REPLICATION = 1U << 0U,
};
|
||||
|
||||
// Returns true only when every experiment bit set in `experiments` is currently enabled.
bool AreExperimentsEnabled(Experiments experiments);
|
||||
|
||||
// Parses the comma separated value of the experimental_enabled flag and stores the resulting set of
// enabled experiments; names that are not recognised are ignored.
void InitializeExperimental();
|
||||
|
||||
} // namespace memgraph::flags
|
@ -18,6 +18,12 @@ DEFINE_uint32(coordinator_server_port, 0, "Port on which coordinator servers wil
|
||||
DEFINE_uint32(raft_server_port, 0, "Port on which raft servers will be started.");
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
DEFINE_uint32(raft_server_id, 0, "Unique ID of the raft server.");
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
DEFINE_uint32(instance_down_timeout_sec, 5, "Time duration after which an instance is considered down.");
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
DEFINE_uint32(instance_health_check_frequency_sec, 1, "The time duration between two health checks/pings.");
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
DEFINE_uint32(instance_get_uuid_frequency_sec, 10, "The time duration between two instance uuid checks.");
|
||||
#endif
|
||||
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
|
@ -20,6 +20,12 @@ DECLARE_uint32(coordinator_server_port);
|
||||
DECLARE_uint32(raft_server_port);
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
DECLARE_uint32(raft_server_id);
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
DECLARE_uint32(instance_down_timeout_sec);
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
DECLARE_uint32(instance_health_check_frequency_sec);
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
DECLARE_uint32(instance_get_uuid_frequency_sec);
|
||||
#endif
|
||||
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
|
@ -134,6 +134,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
|
||||
memgraph::flags::InitializeLogger();
|
||||
memgraph::flags::InitializeExperimental();
|
||||
|
||||
// Unhandled exception handler init.
|
||||
std::set_terminate(&memgraph::utils::TerminateHandler);
|
||||
@ -355,6 +356,11 @@ int main(int argc, char **argv) {
|
||||
memgraph::query::InterpreterConfig interp_config{
|
||||
.query = {.allow_load_csv = FLAGS_allow_load_csv},
|
||||
.replication_replica_check_frequency = std::chrono::seconds(FLAGS_replication_replica_check_frequency_sec),
|
||||
#ifdef MG_ENTERPRISE
|
||||
.instance_down_timeout_sec = std::chrono::seconds(FLAGS_instance_down_timeout_sec),
|
||||
.instance_health_check_frequency_sec = std::chrono::seconds(FLAGS_instance_health_check_frequency_sec),
|
||||
.instance_get_uuid_frequency_sec = std::chrono::seconds(FLAGS_instance_get_uuid_frequency_sec),
|
||||
#endif
|
||||
.default_kafka_bootstrap_servers = FLAGS_kafka_bootstrap_servers,
|
||||
.default_pulsar_service_url = FLAGS_pulsar_service_url,
|
||||
.stream_transaction_conflict_retries = FLAGS_stream_transaction_conflict_retries,
|
||||
@ -396,7 +402,7 @@ int main(int argc, char **argv) {
|
||||
memgraph::coordination::CoordinatorState coordinator_state;
|
||||
#endif
|
||||
|
||||
memgraph::dbms::DbmsHandler dbms_handler(db_config, system, repl_state
|
||||
memgraph::dbms::DbmsHandler dbms_handler(db_config, repl_state
|
||||
#ifdef MG_ENTERPRISE
|
||||
,
|
||||
auth_, FLAGS_data_recovery_on_startup
|
||||
@ -409,7 +415,7 @@ int main(int argc, char **argv) {
|
||||
auto replication_handler = memgraph::replication::ReplicationHandler{repl_state, dbms_handler
|
||||
#ifdef MG_ENTERPRISE
|
||||
,
|
||||
&system, auth_
|
||||
system, auth_
|
||||
#endif
|
||||
};
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright 2023 Memgraph Ltd.
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
@ -60,10 +60,13 @@ void *my_alloc(extent_hooks_t *extent_hooks, void *new_addr, size_t size, size_t
|
||||
unsigned arena_ind) {
|
||||
// This needs to be before, to throw exception in case of too big alloc
|
||||
if (*commit) [[likely]] {
|
||||
memgraph::utils::total_memory_tracker.Alloc(static_cast<int64_t>(size));
|
||||
if (GetQueriesMemoryControl().IsThreadTracked()) [[unlikely]] {
|
||||
GetQueriesMemoryControl().TrackAllocOnCurrentThread(size);
|
||||
bool ok = GetQueriesMemoryControl().TrackAllocOnCurrentThread(size);
|
||||
if (!ok) return nullptr;
|
||||
}
|
||||
// This needs to be here so it doesn't get incremented in case the first TrackAlloc throws an exception
|
||||
bool ok = memgraph::utils::total_memory_tracker.Alloc(static_cast<int64_t>(size));
|
||||
if (!ok) return nullptr;
|
||||
}
|
||||
|
||||
auto *ptr = old_hooks->alloc(extent_hooks, new_addr, size, alignment, zero, commit, arena_ind);
|
||||
@ -117,11 +120,15 @@ static bool my_commit(extent_hooks_t *extent_hooks, void *addr, size_t size, siz
|
||||
return err;
|
||||
}
|
||||
|
||||
memgraph::utils::total_memory_tracker.Alloc(static_cast<int64_t>(length));
|
||||
[[maybe_unused]] auto blocker = memgraph::utils::MemoryTracker::OutOfMemoryExceptionBlocker{};
|
||||
if (GetQueriesMemoryControl().IsThreadTracked()) [[unlikely]] {
|
||||
GetQueriesMemoryControl().TrackAllocOnCurrentThread(size);
|
||||
bool ok = GetQueriesMemoryControl().TrackAllocOnCurrentThread(length);
|
||||
DMG_ASSERT(ok);
|
||||
}
|
||||
|
||||
auto ok = memgraph::utils::total_memory_tracker.Alloc(static_cast<int64_t>(length));
|
||||
DMG_ASSERT(ok);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright 2023 Memgraph Ltd.
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
@ -28,6 +28,12 @@ void *newImpl(const std::size_t size) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
[[maybe_unused]] auto blocker = memgraph::utils::MemoryTracker::OutOfMemoryExceptionBlocker{};
|
||||
auto maybe_msg = memgraph::utils::MemoryErrorStatus().msg();
|
||||
if (maybe_msg) {
|
||||
throw memgraph::utils::OutOfMemoryException{std::move(*maybe_msg)};
|
||||
}
|
||||
|
||||
throw std::bad_alloc{};
|
||||
}
|
||||
|
||||
@ -37,11 +43,21 @@ void *newImpl(const std::size_t size, const std::align_val_t align) {
|
||||
return ptr;
|
||||
}
|
||||
|
||||
[[maybe_unused]] auto blocker = memgraph::utils::MemoryTracker::OutOfMemoryExceptionBlocker{};
|
||||
auto maybe_msg = memgraph::utils::MemoryErrorStatus().msg();
|
||||
if (maybe_msg) {
|
||||
throw memgraph::utils::OutOfMemoryException{std::move(*maybe_msg)};
|
||||
}
|
||||
|
||||
throw std::bad_alloc{};
|
||||
}
|
||||
|
||||
void *newNoExcept(const std::size_t size) noexcept { return malloc(size); }
|
||||
void *newNoExcept(const std::size_t size) noexcept {
|
||||
[[maybe_unused]] auto blocker = memgraph::utils::MemoryTracker::OutOfMemoryExceptionBlocker{};
|
||||
return malloc(size);
|
||||
}
|
||||
void *newNoExcept(const std::size_t size, const std::align_val_t align) noexcept {
|
||||
[[maybe_unused]] auto blocker = memgraph::utils::MemoryTracker::OutOfMemoryExceptionBlocker{};
|
||||
return aligned_alloc(size, static_cast<std::size_t>(align));
|
||||
}
|
||||
|
||||
|
@ -54,14 +54,14 @@ void QueriesMemoryControl::EraseThreadToTransactionId(const std::thread::id &thr
|
||||
}
|
||||
}
|
||||
|
||||
void QueriesMemoryControl::TrackAllocOnCurrentThread(size_t size) {
|
||||
bool QueriesMemoryControl::TrackAllocOnCurrentThread(size_t size) {
|
||||
auto thread_id_to_transaction_id_accessor = thread_id_to_transaction_id.access();
|
||||
|
||||
// we might be just constructing mapping between thread id and transaction id
|
||||
// so we miss this allocation
|
||||
auto thread_id_to_transaction_id_elem = thread_id_to_transaction_id_accessor.find(std::this_thread::get_id());
|
||||
if (thread_id_to_transaction_id_elem == thread_id_to_transaction_id_accessor.end()) {
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
auto transaction_id_to_tracker_accessor = transaction_id_to_tracker.access();
|
||||
@ -71,10 +71,10 @@ void QueriesMemoryControl::TrackAllocOnCurrentThread(size_t size) {
|
||||
// It can happen that some allocation happens between mapping thread to
|
||||
// transaction id, so we miss this allocation
|
||||
if (transaction_id_to_tracker == transaction_id_to_tracker_accessor.end()) [[unlikely]] {
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
auto &query_tracker = transaction_id_to_tracker->tracker;
|
||||
query_tracker.TrackAlloc(size);
|
||||
return query_tracker.TrackAlloc(size);
|
||||
}
|
||||
|
||||
void QueriesMemoryControl::TrackFreeOnCurrentThread(size_t size) {
|
||||
@ -110,7 +110,7 @@ void QueriesMemoryControl::CreateTransactionIdTracker(uint64_t transaction_id, s
|
||||
|
||||
bool QueriesMemoryControl::EraseTransactionIdTracker(uint64_t transaction_id) {
|
||||
auto transaction_id_to_tracker_accessor = transaction_id_to_tracker.access();
|
||||
auto removed = transaction_id_to_tracker.access().remove(transaction_id);
|
||||
auto removed = transaction_id_to_tracker_accessor.remove(transaction_id);
|
||||
return removed;
|
||||
}
|
||||
|
||||
|
@ -62,7 +62,7 @@ class QueriesMemoryControl {
|
||||
// Find tracker for current thread if exists, track
|
||||
// query allocation and procedure allocation if
|
||||
// necessary
|
||||
void TrackAllocOnCurrentThread(size_t size);
|
||||
bool TrackAllocOnCurrentThread(size_t size);
|
||||
|
||||
// Find tracker for current thread if exists, track
|
||||
// query allocation and procedure allocation if
|
||||
|
@ -22,6 +22,10 @@ struct InterpreterConfig {
|
||||
// The same as \ref memgraph::replication::ReplicationClientConfig
|
||||
std::chrono::seconds replication_replica_check_frequency{1};
|
||||
|
||||
std::chrono::seconds instance_down_timeout_sec{5};
|
||||
std::chrono::seconds instance_health_check_frequency_sec{1};
|
||||
std::chrono::seconds instance_get_uuid_frequency_sec{10};
|
||||
|
||||
std::string default_kafka_bootstrap_servers;
|
||||
std::string default_pulsar_service_url;
|
||||
uint32_t stream_transaction_conflict_retries;
|
||||
|
@ -3065,7 +3065,7 @@ class ReplicationQuery : public memgraph::query::Query {
|
||||
|
||||
enum class SyncMode { SYNC, ASYNC };
|
||||
|
||||
enum class ReplicaState { READY, REPLICATING, RECOVERY, MAYBE_BEHIND };
|
||||
enum class ReplicaState { READY, REPLICATING, RECOVERY, MAYBE_BEHIND, DIVERGED_FROM_MAIN };
|
||||
|
||||
ReplicationQuery() = default;
|
||||
|
||||
@ -3102,7 +3102,13 @@ class CoordinatorQuery : public memgraph::query::Query {
|
||||
static const utils::TypeInfo kType;
|
||||
const utils::TypeInfo &GetTypeInfo() const override { return kType; }
|
||||
|
||||
enum class Action { REGISTER_INSTANCE, SET_INSTANCE_TO_MAIN, SHOW_INSTANCES, ADD_COORDINATOR_INSTANCE };
|
||||
enum class Action {
|
||||
REGISTER_INSTANCE,
|
||||
UNREGISTER_INSTANCE,
|
||||
SET_INSTANCE_TO_MAIN,
|
||||
SHOW_INSTANCES,
|
||||
ADD_COORDINATOR_INSTANCE
|
||||
};
|
||||
|
||||
enum class SyncMode { SYNC, ASYNC };
|
||||
|
||||
|
@ -421,6 +421,14 @@ antlrcpp::Any CypherMainVisitor::visitRegisterInstanceOnCoordinator(
|
||||
return coordinator_query;
|
||||
}
|
||||
|
||||
// Builds a CoordinatorQuery with the UNREGISTER_INSTANCE action and the instance name taken from the parse
// tree.
antlrcpp::Any CypherMainVisitor::visitUnregisterInstanceOnCoordinator(
    MemgraphCypher::UnregisterInstanceOnCoordinatorContext *ctx) {
  auto *query = storage_->Create<CoordinatorQuery>();
  query->action_ = CoordinatorQuery::Action::UNREGISTER_INSTANCE;

  // The instance name is the symbolic name nested under the instanceName rule.
  auto *name_ctx = ctx->instanceName()->symbolicName();
  query->instance_name_ = std::any_cast<std::string>(name_ctx->accept(this));

  return query;
}
|
||||
|
||||
antlrcpp::Any CypherMainVisitor::visitAddCoordinatorInstance(MemgraphCypher::AddCoordinatorInstanceContext *ctx) {
|
||||
auto *coordinator_query = storage_->Create<CoordinatorQuery>();
|
||||
|
||||
|
@ -248,6 +248,12 @@ class CypherMainVisitor : public antlropencypher::MemgraphCypherBaseVisitor {
|
||||
*/
|
||||
antlrcpp::Any visitRegisterInstanceOnCoordinator(MemgraphCypher::RegisterInstanceOnCoordinatorContext *ctx) override;
|
||||
|
||||
/**
|
||||
* @return CoordinatorQuery*
|
||||
*/
|
||||
antlrcpp::Any visitUnregisterInstanceOnCoordinator(
|
||||
MemgraphCypher::UnregisterInstanceOnCoordinatorContext *ctx) override;
|
||||
|
||||
/**
|
||||
* @return CoordinatorQuery*
|
||||
*/
|
||||
|
@ -191,6 +191,7 @@ replicationQuery : setReplicationRole
|
||||
;
|
||||
|
||||
coordinatorQuery : registerInstanceOnCoordinator
|
||||
| unregisterInstanceOnCoordinator
|
||||
| setInstanceToMain
|
||||
| showInstances
|
||||
| addCoordinatorInstance
|
||||
@ -393,6 +394,8 @@ registerReplica : REGISTER REPLICA instanceName ( SYNC | ASYNC )
|
||||
|
||||
registerInstanceOnCoordinator : REGISTER INSTANCE instanceName ON coordinatorSocketAddress ( AS ASYNC ) ? WITH replicationSocketAddress ;
|
||||
|
||||
unregisterInstanceOnCoordinator : UNREGISTER INSTANCE instanceName ;
|
||||
|
||||
setInstanceToMain : SET INSTANCE instanceName TO MAIN ;
|
||||
|
||||
raftServerId : literal ;
|
||||
|
@ -141,6 +141,7 @@ TRIGGER : T R I G G E R ;
|
||||
TRIGGERS : T R I G G E R S ;
|
||||
UNCOMMITTED : U N C O M M I T T E D ;
|
||||
UNLOCK : U N L O C K ;
|
||||
UNREGISTER : U N R E G I S T E R ;
|
||||
UPDATE : U P D A T E ;
|
||||
USE : U S E ;
|
||||
USER : U S E R ;
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright 2023 Memgraph Ltd.
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
@ -394,9 +394,16 @@ SymbolGenerator::ReturnType SymbolGenerator::Visit(Identifier &ident) {
|
||||
// can reference symbols bound later in the same MATCH. We collect them
|
||||
// here, so that they can be checked after visiting Match.
|
||||
scope.identifiers_in_match.emplace_back(&ident);
|
||||
} else if (scope.in_call_subquery && !scope.in_with) {
|
||||
if (!scope.symbols.contains(ident.name_) && !ConsumePredefinedIdentifier(ident.name_)) {
|
||||
throw UnboundVariableError(ident.name_);
|
||||
}
|
||||
symbol = GetOrCreateSymbol(ident.name_, ident.user_declared_, Symbol::Type::ANY);
|
||||
} else {
|
||||
// Everything else references a bound symbol.
|
||||
if (!HasSymbol(ident.name_) && !ConsumePredefinedIdentifier(ident.name_)) throw UnboundVariableError(ident.name_);
|
||||
if (!HasSymbol(ident.name_) && !ConsumePredefinedIdentifier(ident.name_)) {
|
||||
throw UnboundVariableError(ident.name_);
|
||||
}
|
||||
symbol = GetOrCreateSymbol(ident.name_, ident.user_declared_, Symbol::Type::ANY);
|
||||
}
|
||||
ident.MapTo(symbol);
|
||||
|
@ -93,6 +93,7 @@
|
||||
#include "utils/exceptions.hpp"
|
||||
#include "utils/file.hpp"
|
||||
#include "utils/flag_validation.hpp"
|
||||
#include "utils/functional.hpp"
|
||||
#include "utils/likely.hpp"
|
||||
#include "utils/logging.hpp"
|
||||
#include "utils/memory.hpp"
|
||||
@ -109,6 +110,7 @@
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
#include "coordination/constants.hpp"
|
||||
#include "flags/experimental.hpp"
|
||||
#endif
|
||||
|
||||
namespace memgraph::metrics {
|
||||
@ -327,7 +329,7 @@ class ReplQueryHandler {
|
||||
.port = static_cast<uint16_t>(*port),
|
||||
};
|
||||
|
||||
if (!handler_->SetReplicationRoleReplica(config, std::nullopt)) {
|
||||
if (!handler_->TrySetReplicationRoleReplica(config, std::nullopt)) {
|
||||
throw QueryRuntimeException("Couldn't set role to replica!");
|
||||
}
|
||||
}
|
||||
@ -437,6 +439,9 @@ class ReplQueryHandler {
|
||||
case storage::replication::ReplicaState::MAYBE_BEHIND:
|
||||
replica.state = ReplicationQuery::ReplicaState::MAYBE_BEHIND;
|
||||
break;
|
||||
case storage::replication::ReplicaState::DIVERGED_FROM_MAIN:
|
||||
replica.state = ReplicationQuery::ReplicaState::DIVERGED_FROM_MAIN;
|
||||
break;
|
||||
}
|
||||
|
||||
return replica;
|
||||
@ -457,10 +462,33 @@ class CoordQueryHandler final : public query::CoordinatorQueryHandler {
|
||||
|
||||
: coordinator_handler_(coordinator_state) {}
|
||||
|
||||
/// @throw QueryRuntimeException if an error occurred.
|
||||
void RegisterInstance(const std::string &coordinator_socket_address, const std::string &replication_socket_address,
|
||||
const std::chrono::seconds instance_check_frequency, const std::string &instance_name,
|
||||
CoordinatorQuery::SyncMode sync_mode) override {
|
||||
void UnregisterInstance(std::string const &instance_name) override {
|
||||
auto status = coordinator_handler_.UnregisterReplicationInstance(instance_name);
|
||||
switch (status) {
|
||||
using enum memgraph::coordination::UnregisterInstanceCoordinatorStatus;
|
||||
case NO_INSTANCE_WITH_NAME:
|
||||
throw QueryRuntimeException("No instance with such name!");
|
||||
case IS_MAIN:
|
||||
throw QueryRuntimeException(
|
||||
"Alive main instance can't be unregistered! Shut it down to trigger failover and then unregister it!");
|
||||
case NOT_COORDINATOR:
|
||||
throw QueryRuntimeException("UNREGISTER INSTANCE query can only be run on a coordinator!");
|
||||
case NOT_LEADER:
|
||||
throw QueryRuntimeException("Couldn't unregister replica instance since coordinator is not a leader!");
|
||||
case RPC_FAILED:
|
||||
throw QueryRuntimeException(
|
||||
"Couldn't unregister replica instance because current main instance couldn't unregister replica!");
|
||||
case SUCCESS:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void RegisterReplicationInstance(std::string const &coordinator_socket_address,
|
||||
std::string const &replication_socket_address,
|
||||
std::chrono::seconds const &instance_check_frequency,
|
||||
std::chrono::seconds const &instance_down_timeout,
|
||||
std::chrono::seconds const &instance_get_uuid_frequency,
|
||||
std::string const &instance_name, CoordinatorQuery::SyncMode sync_mode) override {
|
||||
const auto maybe_replication_ip_port =
|
||||
io::network::Endpoint::ParseSocketOrAddress(replication_socket_address, std::nullopt);
|
||||
if (!maybe_replication_ip_port) {
|
||||
@ -485,11 +513,13 @@ class CoordQueryHandler final : public query::CoordinatorQueryHandler {
|
||||
coordination::CoordinatorClientConfig{.instance_name = instance_name,
|
||||
.ip_address = coordinator_server_ip,
|
||||
.port = coordinator_server_port,
|
||||
.health_check_frequency_sec = instance_check_frequency,
|
||||
.instance_health_check_frequency_sec = instance_check_frequency,
|
||||
.instance_down_timeout_sec = instance_down_timeout,
|
||||
.instance_get_uuid_frequency_sec = instance_get_uuid_frequency,
|
||||
.replication_client_info = repl_config,
|
||||
.ssl = std::nullopt};
|
||||
|
||||
auto status = coordinator_handler_.RegisterInstance(coordinator_client_config);
|
||||
auto status = coordinator_handler_.RegisterReplicationInstance(coordinator_client_config);
|
||||
switch (status) {
|
||||
using enum memgraph::coordination::RegisterInstanceCoordinatorStatus;
|
||||
case NAME_EXISTS:
|
||||
@ -499,6 +529,14 @@ class CoordQueryHandler final : public query::CoordinatorQueryHandler {
|
||||
"Couldn't register replica instance since instance with such endpoint already exists!");
|
||||
case NOT_COORDINATOR:
|
||||
throw QueryRuntimeException("REGISTER INSTANCE query can only be run on a coordinator!");
|
||||
case NOT_LEADER:
|
||||
throw QueryRuntimeException("Couldn't register replica instance since coordinator is not a leader!");
|
||||
case RAFT_COULD_NOT_ACCEPT:
|
||||
throw QueryRuntimeException(
|
||||
"Couldn't register replica instance since raft server couldn't accept the log! Most likely the raft "
|
||||
"instance is not a leader!");
|
||||
case RAFT_COULD_NOT_APPEND:
|
||||
throw QueryRuntimeException("Couldn't register replica instance since raft server couldn't append the log!");
|
||||
case RPC_FAILED:
|
||||
throw QueryRuntimeException(
|
||||
"Couldn't register replica instance because setting instance to replica failed! Check logs on replica to "
|
||||
@ -519,12 +557,14 @@ class CoordQueryHandler final : public query::CoordinatorQueryHandler {
|
||||
}
|
||||
}
|
||||
|
||||
void SetInstanceToMain(const std::string &instance_name) override {
|
||||
auto status = coordinator_handler_.SetInstanceToMain(instance_name);
|
||||
void SetReplicationInstanceToMain(const std::string &instance_name) override {
|
||||
auto status = coordinator_handler_.SetReplicationInstanceToMain(instance_name);
|
||||
switch (status) {
|
||||
using enum memgraph::coordination::SetInstanceToMainCoordinatorStatus;
|
||||
case NO_INSTANCE_WITH_NAME:
|
||||
throw QueryRuntimeException("No instance with such name!");
|
||||
case MAIN_ALREADY_EXISTS:
|
||||
throw QueryRuntimeException("Couldn't set instance to main since there is already a main instance in cluster!");
|
||||
case NOT_COORDINATOR:
|
||||
throw QueryRuntimeException("SET INSTANCE TO MAIN query can only be run on a coordinator!");
|
||||
case COULD_NOT_PROMOTE_TO_MAIN:
|
||||
@ -1073,6 +1113,9 @@ Callback HandleReplicationQuery(ReplicationQuery *repl_query, const Parameters &
|
||||
case ReplicationQuery::ReplicaState::MAYBE_BEHIND:
|
||||
typed_replica.emplace_back("invalid");
|
||||
break;
|
||||
case ReplicationQuery::ReplicaState::DIVERGED_FROM_MAIN:
|
||||
typed_replica.emplace_back("diverged");
|
||||
break;
|
||||
}
|
||||
|
||||
typed_replicas.emplace_back(std::move(typed_replica));
|
||||
@ -1142,12 +1185,16 @@ Callback HandleCoordinatorQuery(CoordinatorQuery *coordinator_query, const Param
|
||||
auto coordinator_socket_address_tv = coordinator_query->coordinator_socket_address_->Accept(evaluator);
|
||||
auto replication_socket_address_tv = coordinator_query->replication_socket_address_->Accept(evaluator);
|
||||
callback.fn = [handler = CoordQueryHandler{*coordinator_state}, coordinator_socket_address_tv,
|
||||
replication_socket_address_tv, main_check_frequency = config.replication_replica_check_frequency,
|
||||
replication_socket_address_tv,
|
||||
instance_health_check_frequency_sec = config.instance_health_check_frequency_sec,
|
||||
instance_name = coordinator_query->instance_name_,
|
||||
instance_down_timeout_sec = config.instance_down_timeout_sec,
|
||||
instance_get_uuid_frequency_sec = config.instance_get_uuid_frequency_sec,
|
||||
sync_mode = coordinator_query->sync_mode_]() mutable {
|
||||
handler.RegisterInstance(std::string(coordinator_socket_address_tv.ValueString()),
|
||||
std::string(replication_socket_address_tv.ValueString()), main_check_frequency,
|
||||
instance_name, sync_mode);
|
||||
handler.RegisterReplicationInstance(std::string(coordinator_socket_address_tv.ValueString()),
|
||||
std::string(replication_socket_address_tv.ValueString()),
|
||||
instance_health_check_frequency_sec, instance_down_timeout_sec,
|
||||
instance_get_uuid_frequency_sec, instance_name, sync_mode);
|
||||
return std::vector<std::vector<TypedValue>>();
|
||||
};
|
||||
|
||||
@ -1157,6 +1204,30 @@ Callback HandleCoordinatorQuery(CoordinatorQuery *coordinator_query, const Param
|
||||
coordinator_socket_address_tv.ValueString(), coordinator_query->instance_name_));
|
||||
return callback;
|
||||
}
|
||||
case CoordinatorQuery::Action::UNREGISTER_INSTANCE:
|
||||
if (!license::global_license_checker.IsEnterpriseValidFast()) {
|
||||
throw QueryException("Trying to use enterprise feature without a valid license.");
|
||||
}
|
||||
|
||||
if constexpr (!coordination::allow_ha) {
|
||||
throw QueryRuntimeException(
|
||||
"High availability is experimental feature. Please set MG_EXPERIMENTAL_HIGH_AVAILABILITY compile flag to "
|
||||
"be able to use this functionality.");
|
||||
}
|
||||
if (!FLAGS_raft_server_id) {
|
||||
throw QueryRuntimeException("Only coordinator can register coordinator server!");
|
||||
}
|
||||
callback.fn = [handler = CoordQueryHandler{*coordinator_state},
|
||||
instance_name = coordinator_query->instance_name_]() mutable {
|
||||
handler.UnregisterInstance(instance_name);
|
||||
return std::vector<std::vector<TypedValue>>();
|
||||
};
|
||||
notifications->emplace_back(
|
||||
SeverityLevel::INFO, NotificationCode::UNREGISTER_INSTANCE,
|
||||
fmt::format("Coordinator has unregistered instance {}.", coordinator_query->instance_name_));
|
||||
|
||||
return callback;
|
||||
|
||||
case CoordinatorQuery::Action::SET_INSTANCE_TO_MAIN: {
|
||||
if (!license::global_license_checker.IsEnterpriseValidFast()) {
|
||||
throw QueryException("Trying to use enterprise feature without a valid license.");
|
||||
@ -1176,7 +1247,7 @@ Callback HandleCoordinatorQuery(CoordinatorQuery *coordinator_query, const Param
|
||||
|
||||
callback.fn = [handler = CoordQueryHandler{*coordinator_state},
|
||||
instance_name = coordinator_query->instance_name_]() mutable {
|
||||
handler.SetInstanceToMain(instance_name);
|
||||
handler.SetReplicationInstanceToMain(instance_name);
|
||||
return std::vector<std::vector<TypedValue>>();
|
||||
};
|
||||
|
||||
@ -1199,17 +1270,13 @@ Callback HandleCoordinatorQuery(CoordinatorQuery *coordinator_query, const Param
|
||||
callback.fn = [handler = CoordQueryHandler{*coordinator_state},
|
||||
replica_nfields = callback.header.size()]() mutable {
|
||||
auto const instances = handler.ShowInstances();
|
||||
std::vector<std::vector<TypedValue>> result{};
|
||||
result.reserve(result.size());
|
||||
auto const converter = [](const auto &status) -> std::vector<TypedValue> {
|
||||
return {TypedValue{status.instance_name}, TypedValue{status.raft_socket_address},
|
||||
TypedValue{status.coord_socket_address}, TypedValue{status.is_alive},
|
||||
TypedValue{status.cluster_role}};
|
||||
};
|
||||
|
||||
std::ranges::transform(instances, std::back_inserter(result),
|
||||
[](const auto &status) -> std::vector<TypedValue> {
|
||||
return {TypedValue{status.instance_name}, TypedValue{status.raft_socket_address},
|
||||
TypedValue{status.coord_socket_address}, TypedValue{status.is_alive},
|
||||
TypedValue{status.cluster_role}};
|
||||
});
|
||||
|
||||
return result;
|
||||
return utils::fmap(converter, instances);
|
||||
};
|
||||
return callback;
|
||||
}
|
||||
@ -3946,7 +4013,9 @@ PreparedQuery PrepareMultiDatabaseQuery(ParsedQuery parsed_query, CurrentDB &cur
|
||||
if (current_db.in_explicit_db_) {
|
||||
throw QueryException("Database switching is prohibited if session explicitly defines the used database");
|
||||
}
|
||||
if (!dbms::allow_mt_repl && is_replica) {
|
||||
|
||||
using enum memgraph::flags::Experiments;
|
||||
if (!flags::AreExperimentsEnabled(SYSTEM_REPLICATION) && is_replica) {
|
||||
throw QueryException("Query forbidden on the replica!");
|
||||
}
|
||||
return PreparedQuery{{"STATUS"},
|
||||
@ -4415,9 +4484,19 @@ Interpreter::PrepareResult Interpreter::Prepare(const std::string &query_string,
|
||||
|
||||
UpdateTypeCount(rw_type);
|
||||
|
||||
if (interpreter_context_->repl_state->IsReplica() && IsQueryWrite(rw_type)) {
|
||||
query_execution = nullptr;
|
||||
throw QueryException("Write query forbidden on the replica!");
|
||||
bool const write_query = IsQueryWrite(rw_type);
|
||||
if (write_query) {
|
||||
if (interpreter_context_->repl_state->IsReplica()) {
|
||||
query_execution = nullptr;
|
||||
throw QueryException("Write query forbidden on the replica!");
|
||||
}
|
||||
#ifdef MG_ENTERPRISE
|
||||
if (FLAGS_coordinator_server_port && !interpreter_context_->repl_state->IsMainWriteable()) {
|
||||
query_execution = nullptr;
|
||||
throw QueryException(
|
||||
"Write query forbidden on the main! Coordinator needs to enable writing on main by sending RPC message.");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Set the target db to the current db (some queries have different target from the current db)
|
||||
@ -4604,7 +4683,8 @@ void Interpreter::Commit() {
|
||||
auto const main_commit = [&](replication::RoleMainData &mainData) {
|
||||
// Only enterprise can do system replication
|
||||
#ifdef MG_ENTERPRISE
|
||||
if (license::global_license_checker.IsEnterpriseValidFast()) {
|
||||
using enum memgraph::flags::Experiments;
|
||||
if (flags::AreExperimentsEnabled(SYSTEM_REPLICATION) && license::global_license_checker.IsEnterpriseValidFast()) {
|
||||
return system_transaction_->Commit(memgraph::system::DoReplication{mainData});
|
||||
}
|
||||
#endif
|
||||
|
@ -105,13 +105,18 @@ class CoordinatorQueryHandler {
|
||||
};
|
||||
|
||||
/// @throw QueryRuntimeException if an error occurred.
|
||||
virtual void RegisterInstance(const std::string &coordinator_socket_address,
|
||||
const std::string &replication_socket_address,
|
||||
const std::chrono::seconds instance_check_frequency, const std::string &instance_name,
|
||||
CoordinatorQuery::SyncMode sync_mode) = 0;
|
||||
virtual void RegisterReplicationInstance(std::string const &coordinator_socket_address,
|
||||
std::string const &replication_socket_address,
|
||||
std::chrono::seconds const &instance_health_check_frequency,
|
||||
std::chrono::seconds const &instance_down_timeout,
|
||||
std::chrono::seconds const &instance_get_uuid_frequency,
|
||||
std::string const &instance_name, CoordinatorQuery::SyncMode sync_mode) = 0;
|
||||
|
||||
/// @throw QueryRuntimeException if an error occurred.
|
||||
virtual void SetInstanceToMain(const std::string &instance_name) = 0;
|
||||
virtual void UnregisterInstance(std::string const &instance_name) = 0;
|
||||
|
||||
/// @throw QueryRuntimeException if an error occurred.
|
||||
virtual void SetReplicationInstanceToMain(const std::string &instance_name) = 0;
|
||||
|
||||
/// @throw QueryRuntimeException if an error occurred.
|
||||
virtual std::vector<coordination::InstanceStatus> ShowInstances() const = 0;
|
||||
|
@ -71,6 +71,8 @@ constexpr std::string_view GetCodeString(const NotificationCode code) {
|
||||
return "RegisterCoordinatorServer"sv;
|
||||
case NotificationCode::ADD_COORDINATOR_INSTANCE:
|
||||
return "AddCoordinatorInstance"sv;
|
||||
case NotificationCode::UNREGISTER_INSTANCE:
|
||||
return "UnregisterInstance"sv;
|
||||
#endif
|
||||
case NotificationCode::REPLICA_PORT_WARNING:
|
||||
return "ReplicaPortWarning"sv;
|
||||
|
@ -43,8 +43,9 @@ enum class NotificationCode : uint8_t {
|
||||
REPLICA_PORT_WARNING,
|
||||
REGISTER_REPLICA,
|
||||
#ifdef MG_ENTERPRISE
|
||||
REGISTER_COORDINATOR_SERVER,
|
||||
REGISTER_COORDINATOR_SERVER, // TODO: (andi) What is this?
|
||||
ADD_COORDINATOR_INSTANCE,
|
||||
UNREGISTER_INSTANCE,
|
||||
#endif
|
||||
SET_REPLICA,
|
||||
START_STREAM,
|
||||
|
@ -188,6 +188,7 @@ template <typename TFunc, typename... Args>
|
||||
spdlog::error("Memory allocation error during mg API call: {}", bae.what());
|
||||
return mgp_error::MGP_ERROR_UNABLE_TO_ALLOCATE;
|
||||
} catch (const memgraph::utils::OutOfMemoryException &oome) {
|
||||
[[maybe_unused]] auto blocker = memgraph::utils::MemoryTracker::OutOfMemoryExceptionBlocker{};
|
||||
spdlog::error("Memory limit exceeded during mg API call: {}", oome.what());
|
||||
return mgp_error::MGP_ERROR_UNABLE_TO_ALLOCATE;
|
||||
} catch (const std::out_of_range &oore) {
|
||||
@ -199,12 +200,12 @@ template <typename TFunc, typename... Args>
|
||||
} catch (const std::logic_error &lee) {
|
||||
spdlog::error("Logic error during mg API call: {}", lee.what());
|
||||
return mgp_error::MGP_ERROR_LOGIC_ERROR;
|
||||
} catch (const std::exception &e) {
|
||||
spdlog::error("Unexpected error during mg API call: {}", e.what());
|
||||
return mgp_error::MGP_ERROR_UNKNOWN_ERROR;
|
||||
} catch (const memgraph::utils::temporal::InvalidArgumentException &e) {
|
||||
spdlog::error("Invalid argument was sent to an mg API call for temporal types: {}", e.what());
|
||||
return mgp_error::MGP_ERROR_INVALID_ARGUMENT;
|
||||
} catch (const std::exception &e) {
|
||||
spdlog::error("Unexpected error during mg API call: {}", e.what());
|
||||
return mgp_error::MGP_ERROR_UNKNOWN_ERROR;
|
||||
} catch (...) {
|
||||
spdlog::error("Unexpected error during mg API call");
|
||||
return mgp_error::MGP_ERROR_UNKNOWN_ERROR;
|
||||
|
@ -49,6 +49,9 @@ struct ReplicationQueryHandler {
|
||||
virtual bool SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
||||
const std::optional<utils::UUID> &main_uuid) = 0;
|
||||
|
||||
virtual bool TrySetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
||||
const std::optional<utils::UUID> &main_uuid) = 0;
|
||||
|
||||
// as MAIN, define and connect to REPLICAs
|
||||
virtual auto TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid)
|
||||
-> utils::BasicResult<RegisterReplicaError> = 0;
|
||||
|
@ -39,7 +39,8 @@ enum class RegisterReplicaError : uint8_t { NAME_EXISTS, ENDPOINT_EXISTS, COULD_
|
||||
|
||||
struct RoleMainData {
|
||||
RoleMainData() = default;
|
||||
explicit RoleMainData(ReplicationEpoch e, std::optional<utils::UUID> uuid = std::nullopt) : epoch_(std::move(e)) {
|
||||
explicit RoleMainData(ReplicationEpoch e, bool writing_enabled, std::optional<utils::UUID> uuid = std::nullopt)
|
||||
: epoch_(std::move(e)), writing_enabled_(writing_enabled) {
|
||||
if (uuid) {
|
||||
uuid_ = *uuid;
|
||||
}
|
||||
@ -54,6 +55,7 @@ struct RoleMainData {
|
||||
ReplicationEpoch epoch_;
|
||||
std::list<ReplicationClient> registered_replicas_{}; // TODO: data race issues
|
||||
utils::UUID uuid_;
|
||||
bool writing_enabled_{false};
|
||||
};
|
||||
|
||||
struct RoleReplicaData {
|
||||
@ -90,6 +92,21 @@ struct ReplicationState {
|
||||
bool IsMain() const { return GetRole() == replication_coordination_glue::ReplicationRole::MAIN; }
|
||||
bool IsReplica() const { return GetRole() == replication_coordination_glue::ReplicationRole::REPLICA; }
|
||||
|
||||
auto IsMainWriteable() const -> bool {
|
||||
if (auto const *main = std::get_if<RoleMainData>(&replication_data_)) {
|
||||
return main->writing_enabled_;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
auto EnableWritingOnMain() -> bool {
|
||||
if (auto *main = std::get_if<RoleMainData>(&replication_data_)) {
|
||||
main->writing_enabled_ = true;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool HasDurability() const { return nullptr != durability_; }
|
||||
|
||||
bool TryPersistRoleMain(std::string new_epoch, utils::UUID main_uuid);
|
||||
|
@ -57,9 +57,17 @@ ReplicationState::ReplicationState(std::optional<std::filesystem::path> durabili
|
||||
auto replication_data = std::move(fetched_replication_data).GetValue();
|
||||
#ifdef MG_ENTERPRISE
|
||||
if (FLAGS_coordinator_server_port && std::holds_alternative<RoleReplicaData>(replication_data)) {
|
||||
spdlog::trace("Restarted replication uuid for replica");
|
||||
std::get<RoleReplicaData>(replication_data).uuid_.reset();
|
||||
}
|
||||
#endif
|
||||
if (std::holds_alternative<RoleReplicaData>(replication_data)) {
|
||||
auto &replica_uuid = std::get<RoleReplicaData>(replication_data).uuid_;
|
||||
std::string uuid = replica_uuid.has_value() ? std::string(replica_uuid.value()) : "";
|
||||
spdlog::trace("Recovered main's uuid for replica {}", uuid);
|
||||
} else {
|
||||
spdlog::trace("Recovered uuid for main {}", std::string(std::get<RoleMainData>(replication_data).uuid_));
|
||||
}
|
||||
replication_data_ = std::move(replication_data);
|
||||
}
|
||||
|
||||
@ -137,8 +145,8 @@ auto ReplicationState::FetchReplicationData() -> FetchReplicationResult_t {
|
||||
return std::visit(
|
||||
utils::Overloaded{
|
||||
[&](durability::MainRole &&r) -> FetchReplicationResult_t {
|
||||
auto res =
|
||||
RoleMainData{std::move(r.epoch), r.main_uuid.has_value() ? r.main_uuid.value() : utils::UUID{}};
|
||||
auto res = RoleMainData{std::move(r.epoch), false,
|
||||
r.main_uuid.has_value() ? r.main_uuid.value() : utils::UUID{}};
|
||||
auto b = durability_->begin(durability::kReplicationReplicaPrefix);
|
||||
auto e = durability_->end(durability::kReplicationReplicaPrefix);
|
||||
for (; b != e; ++b) {
|
||||
@ -246,7 +254,7 @@ bool ReplicationState::SetReplicationRoleMain(const utils::UUID &main_uuid) {
|
||||
return false;
|
||||
}
|
||||
|
||||
replication_data_ = RoleMainData{ReplicationEpoch{new_epoch}, main_uuid};
|
||||
replication_data_ = RoleMainData{ReplicationEpoch{new_epoch}, true, main_uuid};
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -12,7 +12,9 @@
|
||||
|
||||
#include "auth/auth.hpp"
|
||||
#include "dbms/dbms_handler.hpp"
|
||||
#include "flags/experimental.hpp"
|
||||
#include "replication/include/replication/state.hpp"
|
||||
#include "replication_handler/system_replication.hpp"
|
||||
#include "replication_handler/system_rpc.hpp"
|
||||
#include "utils/result.hpp"
|
||||
|
||||
@ -22,8 +24,8 @@ inline std::optional<query::RegisterReplicaError> HandleRegisterReplicaStatus(
|
||||
utils::BasicResult<replication::RegisterReplicaError, replication::ReplicationClient *> &instance_client);
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
void StartReplicaClient(replication::ReplicationClient &client, dbms::DbmsHandler &dbms_handler, utils::UUID main_uuid,
|
||||
system::System *system, auth::SynchedAuth &auth);
|
||||
void StartReplicaClient(replication::ReplicationClient &client, system::System &system, dbms::DbmsHandler &dbms_handler,
|
||||
utils::UUID main_uuid, auth::SynchedAuth &auth);
|
||||
#else
|
||||
void StartReplicaClient(replication::ReplicationClient &client, dbms::DbmsHandler &dbms_handler, utils::UUID main_uuid);
|
||||
#endif
|
||||
@ -33,8 +35,8 @@ void StartReplicaClient(replication::ReplicationClient &client, dbms::DbmsHandle
|
||||
// When being called by interpreter no need to gain lock, it should already be under a system transaction
|
||||
// But concurrently the FrequentCheck is running and will need to lock before reading last_committed_system_timestamp_
|
||||
template <bool REQUIRE_LOCK = false>
|
||||
void SystemRestore(replication::ReplicationClient &client, dbms::DbmsHandler &dbms_handler,
|
||||
const utils::UUID &main_uuid, system::System *system, auth::SynchedAuth &auth) {
|
||||
void SystemRestore(replication::ReplicationClient &client, system::System &system, dbms::DbmsHandler &dbms_handler,
|
||||
const utils::UUID &main_uuid, auth::SynchedAuth &auth) {
|
||||
// Check if system is up to date
|
||||
if (client.state_.WithLock(
|
||||
[](auto &state) { return state == memgraph::replication::ReplicationClient::State::READY; }))
|
||||
@ -42,6 +44,10 @@ void SystemRestore(replication::ReplicationClient &client, dbms::DbmsHandler &db
|
||||
|
||||
// Try to recover...
|
||||
{
|
||||
using enum memgraph::flags::Experiments;
|
||||
bool full_system_replication =
|
||||
flags::AreExperimentsEnabled(SYSTEM_REPLICATION) && license::global_license_checker.IsEnterpriseValidFast();
|
||||
// We still need to system replicate
|
||||
struct DbInfo {
|
||||
std::vector<storage::SalientConfig> configs;
|
||||
uint64_t last_committed_timestamp;
|
||||
@ -49,25 +55,25 @@ void SystemRestore(replication::ReplicationClient &client, dbms::DbmsHandler &db
|
||||
DbInfo db_info = std::invoke([&] {
|
||||
auto guard = std::invoke([&]() -> std::optional<memgraph::system::TransactionGuard> {
|
||||
if constexpr (REQUIRE_LOCK) {
|
||||
return system->GenTransactionGuard();
|
||||
return system.GenTransactionGuard();
|
||||
}
|
||||
return std::nullopt;
|
||||
});
|
||||
|
||||
if (license::global_license_checker.IsEnterpriseValidFast()) {
|
||||
if (full_system_replication) {
|
||||
auto configs = std::vector<storage::SalientConfig>{};
|
||||
dbms_handler.ForEach([&configs](dbms::DatabaseAccess acc) { configs.emplace_back(acc->config().salient); });
|
||||
// TODO: This is `SystemRestore` maybe DbInfo is incorrect as it will need Auth also
|
||||
return DbInfo{configs, system->LastCommittedSystemTimestamp()};
|
||||
return DbInfo{configs, system.LastCommittedSystemTimestamp()};
|
||||
}
|
||||
|
||||
// No license -> send only default config
|
||||
return DbInfo{{dbms_handler.Get()->config().salient}, system->LastCommittedSystemTimestamp()};
|
||||
return DbInfo{{dbms_handler.Get()->config().salient}, system.LastCommittedSystemTimestamp()};
|
||||
});
|
||||
try {
|
||||
auto stream = std::invoke([&]() {
|
||||
// Handle only default database if no license
|
||||
if (!license::global_license_checker.IsEnterpriseValidFast()) {
|
||||
if (!full_system_replication) {
|
||||
return client.rpc_client_.Stream<replication::SystemRecoveryRpc>(
|
||||
main_uuid, db_info.last_committed_timestamp, std::move(db_info.configs), auth::Auth::Config{},
|
||||
std::vector<auth::User>{}, std::vector<auth::Role>{});
|
||||
@ -98,7 +104,7 @@ void SystemRestore(replication::ReplicationClient &client, dbms::DbmsHandler &db
|
||||
struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
|
||||
#ifdef MG_ENTERPRISE
|
||||
explicit ReplicationHandler(memgraph::replication::ReplicationState &repl_state,
|
||||
memgraph::dbms::DbmsHandler &dbms_handler, memgraph::system::System *system,
|
||||
memgraph::dbms::DbmsHandler &dbms_handler, memgraph::system::System &system,
|
||||
memgraph::auth::SynchedAuth &auth);
|
||||
#else
|
||||
explicit ReplicationHandler(memgraph::replication::ReplicationState &repl_state,
|
||||
@ -108,10 +114,14 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
|
||||
// as REPLICA, become MAIN
|
||||
bool SetReplicationRoleMain() override;
|
||||
|
||||
// as MAIN, become REPLICA
|
||||
// as MAIN, become REPLICA; returns false if this instance is already a REPLICA
|
||||
bool SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
||||
const std::optional<utils::UUID> &main_uuid) override;
|
||||
|
||||
// become REPLICA; can be called on MAIN and on an instance that is already a REPLICA
|
||||
bool TrySetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
||||
const std::optional<utils::UUID> &main_uuid) override;
|
||||
|
||||
// as MAIN, define and connect to REPLICAs
|
||||
auto TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid)
|
||||
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> override;
|
||||
@ -132,12 +142,13 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
|
||||
auto GetReplState() const -> const memgraph::replication::ReplicationState &;
|
||||
auto GetReplState() -> memgraph::replication::ReplicationState &;
|
||||
|
||||
auto GetReplicaUUID() -> std::optional<utils::UUID>;
|
||||
|
||||
private:
|
||||
template <bool HandleFailure>
|
||||
template <bool AllowRPCFailure>
|
||||
auto RegisterReplica_(const memgraph::replication::ReplicationClientConfig &config, bool send_swap_uuid)
|
||||
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> {
|
||||
MG_ASSERT(repl_state_.IsMain(), "Only main instance can register a replica!");
|
||||
|
||||
auto maybe_client = repl_state_.RegisterReplica(config);
|
||||
if (maybe_client.HasError()) {
|
||||
switch (maybe_client.GetError()) {
|
||||
@ -154,79 +165,129 @@ struct ReplicationHandler : public memgraph::query::ReplicationQueryHandler {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!memgraph::dbms::allow_mt_repl && dbms_handler_.All().size() > 1) {
|
||||
using enum memgraph::flags::Experiments;
|
||||
bool system_replication_enabled = flags::AreExperimentsEnabled(SYSTEM_REPLICATION);
|
||||
if (!system_replication_enabled && dbms_handler_.Count() > 1) {
|
||||
spdlog::warn("Multi-tenant replication is currently not supported!");
|
||||
}
|
||||
const auto main_uuid =
|
||||
std::get<memgraph::replication::RoleMainData>(dbms_handler_.ReplicationState().ReplicationData()).uuid_;
|
||||
|
||||
if (send_swap_uuid) {
|
||||
if (!memgraph::replication_coordination_glue::SendSwapMainUUIDRpc(maybe_client.GetValue()->rpc_client_,
|
||||
main_uuid)) {
|
||||
return memgraph::query::RegisterReplicaError::ERROR_ACCEPTING_MAIN;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
// Update system before enabling individual storage <-> replica clients
|
||||
SystemRestore(*maybe_client.GetValue(), dbms_handler_, main_uuid, system_, auth_);
|
||||
SystemRestore(*maybe_client.GetValue(), system_, dbms_handler_, main_uuid, auth_);
|
||||
#endif
|
||||
|
||||
const auto dbms_error = HandleRegisterReplicaStatus(maybe_client);
|
||||
if (dbms_error.has_value()) {
|
||||
return *dbms_error;
|
||||
}
|
||||
auto &instance_client_ptr = maybe_client.GetValue();
|
||||
|
||||
bool all_clients_good = true;
|
||||
// Add database specific clients (NOTE Currently all databases are connected to each replica)
|
||||
dbms_handler_.ForEach([&](dbms::DatabaseAccess db_acc) {
|
||||
auto *storage = db_acc->storage();
|
||||
if (!dbms::allow_mt_repl && storage->name() != dbms::kDefaultDB) {
|
||||
if (!system_replication_enabled && storage->name() != dbms::kDefaultDB) {
|
||||
return;
|
||||
}
|
||||
// TODO: ATM only IN_MEMORY_TRANSACTIONAL, fix other modes
|
||||
if (storage->storage_mode_ != storage::StorageMode::IN_MEMORY_TRANSACTIONAL) return;
|
||||
|
||||
all_clients_good &= storage->repl_storage_state_.replication_clients_.WithLock(
|
||||
[storage, &instance_client_ptr, db_acc = std::move(db_acc),
|
||||
main_uuid](auto &storage_clients) mutable { // NOLINT
|
||||
auto client = std::make_unique<storage::ReplicationStorageClient>(*instance_client_ptr, main_uuid);
|
||||
// All good, start replica client
|
||||
client->Start(storage, std::move(db_acc));
|
||||
// After start the storage <-> replica state should be READY or RECOVERING (if correctly started)
|
||||
// MAYBE_BEHIND isn't a statement of the current state, this is the default value
|
||||
// Failed to start due to an error like branching of MAIN and REPLICA
|
||||
const bool success = client->State() != storage::replication::ReplicaState::MAYBE_BEHIND;
|
||||
if (HandleFailure || success) {
|
||||
bool const success = std::invoke([state = client->State()]() {
|
||||
if (state == storage::replication::ReplicaState::DIVERGED_FROM_MAIN) {
|
||||
return false;
|
||||
}
|
||||
if (state == storage::replication::ReplicaState::MAYBE_BEHIND) {
|
||||
return AllowRPCFailure;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
|
||||
if (success) {
|
||||
storage_clients.push_back(std::move(client));
|
||||
}
|
||||
return success;
|
||||
});
|
||||
});
|
||||
|
||||
// NOTE Currently if any databases fails, we revert back
|
||||
if (!HandleFailure && !all_clients_good) {
|
||||
if (!all_clients_good) {
|
||||
spdlog::error("Failed to register all databases on the REPLICA \"{}\"", config.name);
|
||||
UnregisterReplica(config.name);
|
||||
return memgraph::query::RegisterReplicaError::CONNECTION_FAILED;
|
||||
}
|
||||
|
||||
// No client error, start instance level client
|
||||
#ifdef MG_ENTERPRISE
|
||||
StartReplicaClient(*instance_client_ptr, dbms_handler_, main_uuid, system_, auth_);
|
||||
StartReplicaClient(*instance_client_ptr, system_, dbms_handler_, main_uuid, auth_);
|
||||
#else
|
||||
StartReplicaClient(*instance_client_ptr, dbms_handler_, main_uuid);
|
||||
#endif
|
||||
return {};
|
||||
}
|
||||
|
||||
template <bool AllowIdempotency>
|
||||
bool SetReplicationRoleReplica_(const memgraph::replication::ReplicationServerConfig &config,
|
||||
const std::optional<utils::UUID> &main_uuid) {
|
||||
if (repl_state_.IsReplica()) {
|
||||
if (!AllowIdempotency) {
|
||||
return false;
|
||||
}
|
||||
// We don't want to restart the server if we're already a REPLICA with correct config
|
||||
auto &replica_data = std::get<memgraph::replication::RoleReplicaData>(repl_state_.ReplicationData());
|
||||
if (replica_data.config == config) {
|
||||
return true;
|
||||
}
|
||||
repl_state_.SetReplicationRoleReplica(config, main_uuid);
|
||||
#ifdef MG_ENTERPRISE
|
||||
return StartRpcServer(dbms_handler_, replica_data, auth_, system_);
|
||||
#else
|
||||
return StartRpcServer(dbms_handler_, replica_data);
|
||||
#endif
|
||||
}
|
||||
|
||||
// TODO StorageState needs to be synched. Could have a dangling reference if someone adds a database as we are
|
||||
// deleting the replica.
|
||||
// Remove database specific clients
|
||||
dbms_handler_.ForEach([&](memgraph::dbms::DatabaseAccess db_acc) {
|
||||
auto *storage = db_acc->storage();
|
||||
storage->repl_storage_state_.replication_clients_.WithLock([](auto &clients) { clients.clear(); });
|
||||
});
|
||||
// Remove instance level clients
|
||||
std::get<memgraph::replication::RoleMainData>(repl_state_.ReplicationData()).registered_replicas_.clear();
|
||||
|
||||
// Creates the server
|
||||
repl_state_.SetReplicationRoleReplica(config, main_uuid);
|
||||
|
||||
// Start
|
||||
const auto success =
|
||||
std::visit(memgraph::utils::Overloaded{[](memgraph::replication::RoleMainData &) {
|
||||
// ASSERT
|
||||
return false;
|
||||
},
|
||||
[this](memgraph::replication::RoleReplicaData &data) {
|
||||
#ifdef MG_ENTERPRISE
|
||||
return StartRpcServer(dbms_handler_, data, auth_, system_);
|
||||
#else
|
||||
return StartRpcServer(dbms_handler_, data);
|
||||
#endif
|
||||
}},
|
||||
repl_state_.ReplicationData());
|
||||
// TODO Handle error (restore to main?)
|
||||
return success;
|
||||
}
|
||||
|
||||
memgraph::replication::ReplicationState &repl_state_;
|
||||
memgraph::dbms::DbmsHandler &dbms_handler_;
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
memgraph::system::System *system_;
|
||||
memgraph::system::System &system_;
|
||||
memgraph::auth::SynchedAuth &auth_;
|
||||
#endif
|
||||
};
|
||||
|
@ -27,11 +27,16 @@ inline void LogWrongMain(const std::optional<utils::UUID> ¤t_main_uuid, co
|
||||
#ifdef MG_ENTERPRISE
|
||||
void SystemHeartbeatHandler(uint64_t ts, const std::optional<utils::UUID> ¤t_main_uuid, slk::Reader *req_reader,
|
||||
slk::Builder *res_builder);
|
||||
|
||||
void SystemRecoveryHandler(memgraph::system::ReplicaHandlerAccessToState &system_state_access,
|
||||
std::optional<utils::UUID> ¤t_main_uuid, dbms::DbmsHandler &dbms_handler,
|
||||
auth::SynchedAuth &auth, slk::Reader *req_reader, slk::Builder *res_builder);
|
||||
void Register(replication::RoleReplicaData const &data, dbms::DbmsHandler &dbms_handler, auth::SynchedAuth &auth);
|
||||
bool StartRpcServer(dbms::DbmsHandler &dbms_handler, replication::RoleReplicaData &data, auth::SynchedAuth &auth);
|
||||
|
||||
void Register(replication::RoleReplicaData const &data, system::System &system, dbms::DbmsHandler &dbms_handler,
|
||||
auth::SynchedAuth &auth);
|
||||
|
||||
bool StartRpcServer(dbms::DbmsHandler &dbms_handler, replication::RoleReplicaData &data, auth::SynchedAuth &auth,
|
||||
system::System &system);
|
||||
#else
|
||||
bool StartRpcServer(dbms::DbmsHandler &dbms_handler, replication::RoleReplicaData &data);
|
||||
#endif
|
||||
|
@ -17,25 +17,25 @@ namespace memgraph::replication {
|
||||
|
||||
namespace {
|
||||
#ifdef MG_ENTERPRISE
|
||||
void RecoverReplication(memgraph::replication::ReplicationState &repl_state, memgraph::system::System *system,
|
||||
void RecoverReplication(memgraph::replication::ReplicationState &repl_state, memgraph::system::System &system,
|
||||
memgraph::dbms::DbmsHandler &dbms_handler, memgraph::auth::SynchedAuth &auth) {
|
||||
/*
|
||||
* REPLICATION RECOVERY AND STARTUP
|
||||
*/
|
||||
|
||||
// Startup replication state (if recovered at startup)
|
||||
auto replica = [&dbms_handler, &auth](memgraph::replication::RoleReplicaData &data) {
|
||||
return StartRpcServer(dbms_handler, data, auth);
|
||||
auto replica = [&dbms_handler, &auth, &system](memgraph::replication::RoleReplicaData &data) {
|
||||
return memgraph::replication::StartRpcServer(dbms_handler, data, auth, system);
|
||||
};
|
||||
|
||||
// Replication recovery and frequent check start
|
||||
auto main = [system, &dbms_handler, &auth](memgraph::replication::RoleMainData &mainData) {
|
||||
auto main = [&system, &dbms_handler, &auth](memgraph::replication::RoleMainData &mainData) {
|
||||
for (auto &client : mainData.registered_replicas_) {
|
||||
if (client.try_set_uuid &&
|
||||
replication_coordination_glue::SendSwapMainUUIDRpc(client.rpc_client_, mainData.uuid_)) {
|
||||
client.try_set_uuid = false;
|
||||
}
|
||||
SystemRestore(client, dbms_handler, mainData.uuid_, system, auth);
|
||||
SystemRestore(client, system, dbms_handler, mainData.uuid_, auth);
|
||||
}
|
||||
// DBMS here
|
||||
dbms_handler.ForEach([&mainData](memgraph::dbms::DatabaseAccess db_acc) {
|
||||
@ -43,7 +43,7 @@ void RecoverReplication(memgraph::replication::ReplicationState &repl_state, mem
|
||||
});
|
||||
|
||||
for (auto &client : mainData.registered_replicas_) {
|
||||
StartReplicaClient(client, dbms_handler, mainData.uuid_, system, auth);
|
||||
StartReplicaClient(client, system, dbms_handler, mainData.uuid_, auth);
|
||||
}
|
||||
|
||||
// Warning
|
||||
@ -120,8 +120,8 @@ inline std::optional<query::RegisterReplicaError> HandleRegisterReplicaStatus(
|
||||
}
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
void StartReplicaClient(replication::ReplicationClient &client, dbms::DbmsHandler &dbms_handler, utils::UUID main_uuid,
|
||||
system::System *system, auth::SynchedAuth &auth) {
|
||||
void StartReplicaClient(replication::ReplicationClient &client, system::System &system, dbms::DbmsHandler &dbms_handler,
|
||||
utils::UUID main_uuid, auth::SynchedAuth &auth) {
|
||||
#else
|
||||
void StartReplicaClient(replication::ReplicationClient &client, dbms::DbmsHandler &dbms_handler,
|
||||
utils::UUID main_uuid) {
|
||||
@ -129,12 +129,8 @@ void StartReplicaClient(replication::ReplicationClient &client, dbms::DbmsHandle
|
||||
// No client error, start instance level client
|
||||
auto const &endpoint = client.rpc_client_.Endpoint();
|
||||
spdlog::trace("Replication client started at: {}:{}", endpoint.address, endpoint.port);
|
||||
client.StartFrequentCheck([&,
|
||||
#ifdef MG_ENTERPRISE
|
||||
system = system,
|
||||
#endif
|
||||
license = license::global_license_checker.IsEnterpriseValidFast(),
|
||||
main_uuid](bool reconnect, replication::ReplicationClient &client) mutable {
|
||||
client.StartFrequentCheck([&, license = license::global_license_checker.IsEnterpriseValidFast(), main_uuid](
|
||||
bool reconnect, replication::ReplicationClient &client) mutable {
|
||||
if (client.try_set_uuid &&
|
||||
memgraph::replication_coordination_glue::SendSwapMainUUIDRpc(client.rpc_client_, main_uuid)) {
|
||||
client.try_set_uuid = false;
|
||||
@ -151,7 +147,7 @@ void StartReplicaClient(replication::ReplicationClient &client, dbms::DbmsHandle
|
||||
client.state_.WithLock([](auto &state) { state = memgraph::replication::ReplicationClient::State::BEHIND; });
|
||||
}
|
||||
#ifdef MG_ENTERPRISE
|
||||
SystemRestore<true>(client, dbms_handler, main_uuid, system, auth);
|
||||
SystemRestore<true>(client, system, dbms_handler, main_uuid, auth);
|
||||
#endif
|
||||
// Check if any database has been left behind
|
||||
dbms_handler.ForEach([&name = client.name_, reconnect](dbms::DatabaseAccess db_acc) {
|
||||
@ -168,7 +164,7 @@ void StartReplicaClient(replication::ReplicationClient &client, dbms::DbmsHandle
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
ReplicationHandler::ReplicationHandler(memgraph::replication::ReplicationState &repl_state,
|
||||
memgraph::dbms::DbmsHandler &dbms_handler, memgraph::system::System *system,
|
||||
memgraph::dbms::DbmsHandler &dbms_handler, memgraph::system::System &system,
|
||||
memgraph::auth::SynchedAuth &auth)
|
||||
: repl_state_{repl_state}, dbms_handler_{dbms_handler}, system_{system}, auth_{auth} {
|
||||
RecoverReplication(repl_state_, system_, dbms_handler_, auth_);
|
||||
@ -196,39 +192,12 @@ bool ReplicationHandler::SetReplicationRoleMain() {
|
||||
|
||||
bool ReplicationHandler::SetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
|
||||
const std::optional<utils::UUID> &main_uuid) {
|
||||
// We don't want to restart the server if we're already a REPLICA
|
||||
if (repl_state_.IsReplica()) {
|
||||
return false;
|
||||
}
|
||||
return SetReplicationRoleReplica_<false>(config, main_uuid);
|
||||
}
|
||||
|
||||
// TODO StorageState needs to be synched. Could have a dangling reference if someone adds a database as we are
|
||||
// deleting the replica.
|
||||
// Remove database specific clients
|
||||
dbms_handler_.ForEach([&](memgraph::dbms::DatabaseAccess db_acc) {
|
||||
auto *storage = db_acc->storage();
|
||||
storage->repl_storage_state_.replication_clients_.WithLock([](auto &clients) { clients.clear(); });
|
||||
});
|
||||
// Remove instance level clients
|
||||
std::get<memgraph::replication::RoleMainData>(repl_state_.ReplicationData()).registered_replicas_.clear();
|
||||
|
||||
// Creates the server
|
||||
repl_state_.SetReplicationRoleReplica(config, main_uuid);
|
||||
|
||||
// Start
|
||||
const auto success = std::visit(memgraph::utils::Overloaded{[](memgraph::replication::RoleMainData &) {
|
||||
// ASSERT
|
||||
return false;
|
||||
},
|
||||
[this](memgraph::replication::RoleReplicaData &data) {
|
||||
#ifdef MG_ENTERPRISE
|
||||
return StartRpcServer(dbms_handler_, data, auth_);
|
||||
#else
|
||||
return StartRpcServer(dbms_handler_, data);
|
||||
#endif
|
||||
}},
|
||||
repl_state_.ReplicationData());
|
||||
// TODO Handle error (restore to main?)
|
||||
return success;
|
||||
/// Delegates to SetReplicationRoleReplica_ with AllowIdempotency enabled, so the request is also accepted
/// when this instance is already a REPLICA.
bool ReplicationHandler::TrySetReplicationRoleReplica(const memgraph::replication::ReplicationServerConfig &config,
                                                      const std::optional<utils::UUID> &main_uuid) {
  constexpr bool kAllowIdempotency = true;
  return SetReplicationRoleReplica_<kAllowIdempotency>(config, main_uuid);
}
|
||||
|
||||
bool ReplicationHandler::DoReplicaToMainPromotion(const utils::UUID &main_uuid) {
|
||||
@ -260,13 +229,13 @@ bool ReplicationHandler::DoReplicaToMainPromotion(const utils::UUID &main_uuid)
|
||||
auto ReplicationHandler::TryRegisterReplica(const memgraph::replication::ReplicationClientConfig &config,
|
||||
bool send_swap_uuid)
|
||||
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> {
|
||||
return RegisterReplica_<false>(config, send_swap_uuid);
|
||||
return RegisterReplica_<true>(config, send_swap_uuid);
|
||||
}
|
||||
|
||||
auto ReplicationHandler::RegisterReplica(const memgraph::replication::ReplicationClientConfig &config,
|
||||
bool send_swap_uuid)
|
||||
-> memgraph::utils::BasicResult<memgraph::query::RegisterReplicaError> {
|
||||
return RegisterReplica_<true>(config, send_swap_uuid);
|
||||
return RegisterReplica_<false>(config, send_swap_uuid);
|
||||
}
|
||||
|
||||
auto ReplicationHandler::UnregisterReplica(std::string_view name) -> memgraph::query::UnregisterReplicaResult {
|
||||
@ -299,6 +268,11 @@ auto ReplicationHandler::GetRole() const -> memgraph::replication_coordination_g
|
||||
return repl_state_.GetRole();
|
||||
}
|
||||
|
||||
/// Returns a copy of the uuid_ stored in this instance's replica role data.
/// Asserts (MG_ASSERT) that the instance is currently a REPLICA.
auto ReplicationHandler::GetReplicaUUID() -> std::optional<utils::UUID> {
  MG_ASSERT(repl_state_.IsReplica());
  auto const &replica_data = std::get<RoleReplicaData>(repl_state_.ReplicationData());
  return replica_data.uuid_;
}
|
||||
|
||||
auto ReplicationHandler::GetReplState() const -> const memgraph::replication::ReplicationState & { return repl_state_; }
|
||||
|
||||
auto ReplicationHandler::GetReplState() -> memgraph::replication::ReplicationState & { return repl_state_; }
|
||||
|
@ -15,6 +15,7 @@
|
||||
|
||||
#include "auth/replication_handlers.hpp"
|
||||
#include "dbms/replication_handlers.hpp"
|
||||
#include "flags/experimental.hpp"
|
||||
#include "license/license.hpp"
|
||||
#include "replication_handler/system_rpc.hpp"
|
||||
|
||||
@ -56,10 +57,24 @@ void SystemRecoveryHandler(memgraph::system::ReplicaHandlerAccessToState &system
|
||||
memgraph::replication::SystemRecoveryReq req;
|
||||
memgraph::slk::Load(&req, req_reader);
|
||||
|
||||
using enum memgraph::flags::Experiments;
|
||||
auto experimental_system_replication = flags::AreExperimentsEnabled(SYSTEM_REPLICATION);
|
||||
|
||||
// validate
|
||||
if (!current_main_uuid.has_value() || req.main_uuid != current_main_uuid) [[unlikely]] {
|
||||
LogWrongMain(current_main_uuid, req.main_uuid, SystemRecoveryReq::kType.name);
|
||||
return;
|
||||
}
|
||||
// Without experimental system replication only a partial recovery is accepted: the request must carry
// exactly one database config, that config must be the default database, and no auth data may be present.
if (!experimental_system_replication) {
  // The size check comes first and short-circuits, so element 0 is only read when it exists.
  if (req.database_configs.size() != 1 || req.database_configs[0].name != dbms::kDefaultDB) {
    // a partial system recovery should only be updating the default database uuid
    return;  // Failure sent on exit
  }
  if (!req.users.empty() || !req.roles.empty()) {
    // a partial system recovery should not be updating any users or roles
    return;  // Failure sent on exit
  }
}
|
||||
|
||||
/*
|
||||
* DBMS
|
||||
@ -69,7 +84,9 @@ void SystemRecoveryHandler(memgraph::system::ReplicaHandlerAccessToState &system
|
||||
/*
|
||||
* AUTH
|
||||
*/
|
||||
if (!auth::SystemRecoveryHandler(auth, req.auth_config, req.users, req.roles)) return; // Failure sent on exit
|
||||
if (experimental_system_replication) {
|
||||
if (!auth::SystemRecoveryHandler(auth, req.auth_config, req.users, req.roles)) return; // Failure sent on exit
|
||||
}
|
||||
|
||||
/*
|
||||
* SUCCESSFUL RECOVERY
|
||||
@ -79,35 +96,44 @@ void SystemRecoveryHandler(memgraph::system::ReplicaHandlerAccessToState &system
|
||||
res = SystemRecoveryRes(SystemRecoveryRes::Result::SUCCESS);
|
||||
}
|
||||
|
||||
void Register(replication::RoleReplicaData const &data, dbms::DbmsHandler &dbms_handler, auth::SynchedAuth &auth) {
|
||||
void Register(replication::RoleReplicaData const &data, system::System &system, dbms::DbmsHandler &dbms_handler,
|
||||
auth::SynchedAuth &auth) {
|
||||
// NOTE: Register even without license as the user could add a license at run-time
|
||||
// TODO: fix Register when system is removed from DbmsHandler
|
||||
|
||||
auto system_state_access = dbms_handler.system_->CreateSystemStateAccess();
|
||||
auto system_state_access = system.CreateSystemStateAccess();
|
||||
|
||||
using enum memgraph::flags::Experiments;
|
||||
auto experimental_system_replication = flags::AreExperimentsEnabled(SYSTEM_REPLICATION);
|
||||
|
||||
// System
|
||||
// TODO: remove, as this is not used
|
||||
data.server->rpc_server_.Register<replication::SystemHeartbeatRpc>(
|
||||
[&data, system_state_access](auto *req_reader, auto *res_builder) {
|
||||
spdlog::debug("Received SystemHeartbeatRpc");
|
||||
SystemHeartbeatHandler(system_state_access.LastCommitedTS(), data.uuid_, req_reader, res_builder);
|
||||
});
|
||||
if (experimental_system_replication) {
|
||||
data.server->rpc_server_.Register<replication::SystemHeartbeatRpc>(
|
||||
[&data, system_state_access](auto *req_reader, auto *res_builder) {
|
||||
spdlog::debug("Received SystemHeartbeatRpc");
|
||||
SystemHeartbeatHandler(system_state_access.LastCommitedTS(), data.uuid_, req_reader, res_builder);
|
||||
});
|
||||
}
|
||||
// Needed even with experimental_system_replication=false because
|
||||
// need to tell REPLICA the uuid to use for "memgraph" default database
|
||||
data.server->rpc_server_.Register<replication::SystemRecoveryRpc>(
|
||||
[&data, system_state_access, &dbms_handler, &auth](auto *req_reader, auto *res_builder) mutable {
|
||||
spdlog::debug("Received SystemRecoveryRpc");
|
||||
SystemRecoveryHandler(system_state_access, data.uuid_, dbms_handler, auth, req_reader, res_builder);
|
||||
});
|
||||
|
||||
// DBMS
|
||||
dbms::Register(data, system_state_access, dbms_handler);
|
||||
if (experimental_system_replication) {
|
||||
// DBMS
|
||||
dbms::Register(data, system_state_access, dbms_handler);
|
||||
|
||||
// Auth
|
||||
auth::Register(data, system_state_access, auth);
|
||||
// Auth
|
||||
auth::Register(data, system_state_access, auth);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
bool StartRpcServer(dbms::DbmsHandler &dbms_handler, replication::RoleReplicaData &data, auth::SynchedAuth &auth) {
|
||||
bool StartRpcServer(dbms::DbmsHandler &dbms_handler, replication::RoleReplicaData &data, auth::SynchedAuth &auth,
|
||||
system::System &system) {
|
||||
#else
|
||||
bool StartRpcServer(dbms::DbmsHandler &dbms_handler, replication::RoleReplicaData &data) {
|
||||
#endif
|
||||
@ -115,7 +141,7 @@ bool StartRpcServer(dbms::DbmsHandler &dbms_handler, replication::RoleReplicaDat
|
||||
dbms::InMemoryReplicationHandlers::Register(&dbms_handler, data);
|
||||
#ifdef MG_ENTERPRISE
|
||||
// Register system handlers
|
||||
Register(data, dbms_handler, auth);
|
||||
Register(data, system, dbms_handler, auth);
|
||||
#endif
|
||||
// Start server
|
||||
if (!data.server->Start()) {
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright 2023 Memgraph Ltd.
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
@ -14,6 +14,6 @@
|
||||
|
||||
namespace memgraph::storage::replication {
|
||||
|
||||
enum class ReplicaState : std::uint8_t { READY, REPLICATING, RECOVERY, MAYBE_BEHIND };
|
||||
enum class ReplicaState : std::uint8_t { READY, REPLICATING, RECOVERY, MAYBE_BEHIND, DIVERGED_FROM_MAIN };
|
||||
|
||||
} // namespace memgraph::storage::replication
|
||||
|
@ -69,7 +69,7 @@ void ReplicationStorageClient::UpdateReplicaState(Storage *storage, DatabaseAcce
|
||||
"now hold unique data. Please resolve data conflicts and start the "
|
||||
"replication on a clean instance.",
|
||||
client_.name_, client_.name_, client_.name_);
|
||||
// State not updated, hence in MAYBE_BEHIND state
|
||||
replica_state_.WithLock([](auto &val) { val = replication::ReplicaState::DIVERGED_FROM_MAIN; });
|
||||
return;
|
||||
}
|
||||
|
||||
@ -171,6 +171,10 @@ void ReplicationStorageClient::StartTransactionReplication(const uint64_t curren
|
||||
utils::MessageWithLink("Couldn't replicate data to {}.", client_.name_, "https://memgr.ph/replication"));
|
||||
TryCheckReplicaStateAsync(storage, std::move(db_acc));
|
||||
return;
|
||||
case DIVERGED_FROM_MAIN:
|
||||
spdlog::error(utils::MessageWithLink("Couldn't replicate data to {} since replica has diverged from main.",
|
||||
client_.name_, "https://memgr.ph/replication"));
|
||||
return;
|
||||
case READY:
|
||||
MG_ASSERT(!replica_stream_);
|
||||
try {
|
||||
|
@ -26,12 +26,14 @@ struct ISystemAction {
|
||||
/// Durability step which is deferred until commit time
|
||||
virtual void DoDurability() = 0;
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
/// Prepare the RPC payload that will be sent to all replicas clients
|
||||
virtual bool DoReplication(memgraph::replication::ReplicationClient &client, const utils::UUID &main_uuid,
|
||||
memgraph::replication::ReplicationEpoch const &epoch,
|
||||
Transaction const &system_tx) const = 0;
|
||||
|
||||
virtual void PostReplication(memgraph::replication::RoleMainData &main_data) const = 0;
|
||||
#endif
|
||||
|
||||
virtual ~ISystemAction() = default;
|
||||
};
|
||||
|
@ -57,11 +57,13 @@ struct Transaction {
|
||||
/// durability
|
||||
action->DoDurability();
|
||||
|
||||
/// replication prep
|
||||
#ifdef MG_ENTERPRISE
|
||||
/// replication
|
||||
auto action_sync_status = handler.ApplyAction(*action, *this);
|
||||
if (action_sync_status != AllSyncReplicaStatus::AllCommitsConfirmed) {
|
||||
sync_status = AllSyncReplicaStatus::SomeCommitsUnconfirmed;
|
||||
}
|
||||
#endif
|
||||
|
||||
actions_.pop_front();
|
||||
}
|
||||
@ -93,6 +95,7 @@ struct Transaction {
|
||||
std::list<std::unique_ptr<ISystemAction>> actions_;
|
||||
};
|
||||
|
||||
#ifdef MG_ENTERPRISE
|
||||
struct DoReplication {
|
||||
explicit DoReplication(replication::RoleMainData &main_data) : main_data_{main_data} {}
|
||||
auto ApplyAction(ISystemAction const &action, Transaction const &system_tx) -> AllSyncReplicaStatus {
|
||||
@ -113,6 +116,7 @@ struct DoReplication {
|
||||
replication::RoleMainData &main_data_;
|
||||
};
|
||||
static_assert(ReplicationPolicy<DoReplication>);
|
||||
#endif
|
||||
|
||||
struct DoNothing {
|
||||
auto ApplyAction(ISystemAction const & /*action*/, Transaction const & /*system_tx*/) -> AllSyncReplicaStatus {
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright 2023 Memgraph Ltd.
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
@ -42,12 +42,26 @@ namespace memgraph::utils {
|
||||
class BasicException : public std::exception {
|
||||
public:
|
||||
/**
|
||||
* @brief Constructor (C++ STL strings).
|
||||
* @brief Constructor (C++ STL string_view).
|
||||
*
|
||||
* @param message The error message.
|
||||
*/
|
||||
explicit BasicException(std::string_view message) noexcept : msg_(message) {}
|
||||
|
||||
/**
|
||||
* @brief Constructor (string literal).
|
||||
*
|
||||
* @param message The error message.
|
||||
*/
|
||||
explicit BasicException(const char *message) noexcept : msg_(message) {}
|
||||
|
||||
/**
|
||||
* @brief Constructor (C++ STL strings).
|
||||
*
|
||||
* @param message The error message.
|
||||
*/
|
||||
explicit BasicException(std::string message) noexcept : msg_(std::move(message)) {}
|
||||
|
||||
/**
|
||||
* @brief Constructor with format string (C++ STL strings).
|
||||
*
|
||||
|
27
src/utils/functional.hpp
Normal file
27
src/utils/functional.hpp
Normal file
@ -0,0 +1,27 @@
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
// License, and you may not use this file except in compliance with the Business Source License.
|
||||
//
|
||||
// As of the Change Date specified in that file, in accordance with
|
||||
// the Business Source License, use of this software will be governed
|
||||
// by the Apache License, Version 2.0, included in the file
|
||||
// licenses/APL.txt.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace memgraph::utils {
|
||||
|
||||
/// Applies `f` to every element of `v` and collects the results, in order, into a new container.
///
/// @tparam F callable that can be invoked (in the std::invoke sense, so member pointers work too)
///           with a `const T &`
/// @tparam T element type of the input vector
/// @tparam R element type of the result; defaults to the decayed return type of `f`, so callables
///           that return a reference still yield a container of values
/// @tparam V result container; defaults to std::vector<R>; must provide reserve() and push_back()
/// @param f mapping function; a private copy is made and invoked once per element
/// @param v input elements
/// @return container holding the mapped elements in the same order as `v`
template <class F, class T, class R = std::decay_t<std::invoke_result_t<std::decay_t<F> &, const T &>>,
          class V = std::vector<R>>
V fmap(F &&f, const std::vector<T> &v) {
  // Work on our own copy of the callable: a stateful functor handed in as an lvalue is left untouched.
  auto fn = std::forward<F>(f);

  V r;
  r.reserve(v.size());
  for (const auto &elem : v) {
    r.push_back(std::invoke(fn, elem));
  }
  return r;
}
|
||||
|
||||
} // namespace memgraph::utils
|
@ -104,7 +104,7 @@ void MemoryTracker::SetMaximumHardLimit(const int64_t limit) {
|
||||
maximum_hard_limit_ = limit;
|
||||
}
|
||||
|
||||
void MemoryTracker::Alloc(const int64_t size) {
|
||||
bool MemoryTracker::Alloc(int64_t const size) {
|
||||
MG_ASSERT(size >= 0, "Negative size passed to the MemoryTracker.");
|
||||
|
||||
const int64_t will_be = size + amount_.fetch_add(size, std::memory_order_relaxed);
|
||||
@ -116,12 +116,13 @@ void MemoryTracker::Alloc(const int64_t size) {
|
||||
|
||||
amount_.fetch_sub(size, std::memory_order_relaxed);
|
||||
|
||||
throw OutOfMemoryException(
|
||||
fmt::format("Memory limit exceeded! Attempting to allocate a chunk of {} which would put the current "
|
||||
"use to {}, while the maximum allowed size for allocation is set to {}.",
|
||||
GetReadableSize(size), GetReadableSize(will_be), GetReadableSize(current_hard_limit)));
|
||||
// register our error data, we will pick this up on the other side of jemalloc
|
||||
MemoryErrorStatus().set({size, will_be, current_hard_limit});
|
||||
|
||||
return false;
|
||||
}
|
||||
UpdatePeak(will_be);
|
||||
return true;
|
||||
}
|
||||
|
||||
void MemoryTracker::DoCheck() {
|
||||
@ -139,4 +140,23 @@ void MemoryTracker::DoCheck() {
|
||||
|
||||
void MemoryTracker::Free(const int64_t size) { amount_.fetch_sub(size, std::memory_order_relaxed); }
|
||||
|
||||
// DEVNOTE: important that this is allocated at thread construction time
|
||||
// otherwise subtle bug where jemalloc will try to lock a non-recursive mutex
|
||||
// that it already owns
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
|
||||
thread_local MemoryTrackerStatus status;
|
||||
auto MemoryErrorStatus() -> MemoryTrackerStatus & { return status; }
|
||||
|
||||
auto MemoryTrackerStatus::msg() -> std::optional<std::string> {
|
||||
if (!data_) return std::nullopt;
|
||||
|
||||
auto [size, will_be, hard_limit] = *data_;
|
||||
data_.reset();
|
||||
return fmt::format(
|
||||
"Memory limit exceeded! Attempting to allocate a chunk of {} which would put the current "
|
||||
"use to {}, while the maximum allowed size for allocation is set to {}.",
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
GetReadableSize(size), GetReadableSize(will_be), GetReadableSize(hard_limit));
|
||||
}
|
||||
|
||||
} // namespace memgraph::utils
|
||||
|
@ -12,15 +12,35 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
#include "utils/exceptions.hpp"
|
||||
|
||||
namespace memgraph::utils {
|
||||
|
||||
/// Holds at most one pending record describing an allocation that was refused.
struct MemoryTrackerStatus {
  /// Figures describing the refused allocation.
  struct data {
    int64_t size;        ///< size of the chunk that was requested
    int64_t will_be;     ///< usage the request would have led to
    int64_t hard_limit;  ///< maximum allowed size for allocation at that moment
  };

  // DEVNOTE: Do not call from within allocator, will cause another allocation
  auto msg() -> std::optional<std::string>;

  /// Stores `d` as the pending record, replacing any previous one.
  void set(data d) { data_.emplace(d); }

 private:
  std::optional<data> data_;
};
|
||||
|
||||
auto MemoryErrorStatus() -> MemoryTrackerStatus &;
|
||||
|
||||
class OutOfMemoryException : public utils::BasicException {
|
||||
public:
|
||||
explicit OutOfMemoryException(const std::string &msg) : utils::BasicException(msg) {}
|
||||
explicit OutOfMemoryException(std::string msg) : utils::BasicException(std::move(msg)) {}
|
||||
SPECIALIZE_GET_EXCEPTION_NAME(OutOfMemoryException)
|
||||
};
|
||||
|
||||
@ -47,7 +67,7 @@ class MemoryTracker final {
|
||||
|
||||
MemoryTracker &operator=(MemoryTracker &&) = delete;
|
||||
|
||||
void Alloc(int64_t size);
|
||||
bool Alloc(int64_t size);
|
||||
void Free(int64_t size);
|
||||
void DoCheck();
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright 2023 Memgraph Ltd.
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
@ -17,18 +17,19 @@
|
||||
|
||||
namespace memgraph::utils {
|
||||
|
||||
void QueryMemoryTracker::TrackAlloc(size_t size) {
|
||||
bool QueryMemoryTracker::TrackAlloc(size_t size) {
|
||||
if (query_tracker_.has_value()) [[likely]] {
|
||||
query_tracker_->Alloc(static_cast<int64_t>(size));
|
||||
bool ok = query_tracker_->Alloc(static_cast<int64_t>(size));
|
||||
if (!ok) return false;
|
||||
}
|
||||
|
||||
auto *proc_tracker = GetActiveProc();
|
||||
|
||||
if (proc_tracker == nullptr) {
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
proc_tracker->Alloc(static_cast<int64_t>(size));
|
||||
return proc_tracker->Alloc(static_cast<int64_t>(size));
|
||||
}
|
||||
void QueryMemoryTracker::TrackFree(size_t size) {
|
||||
if (query_tracker_.has_value()) [[likely]] {
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright 2023 Memgraph Ltd.
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
@ -44,7 +44,7 @@ class QueryMemoryTracker {
|
||||
~QueryMemoryTracker() = default;
|
||||
|
||||
// Track allocation on query and procedure if active
|
||||
void TrackAlloc(size_t);
|
||||
bool TrackAlloc(size_t size);
|
||||
|
||||
// Track Free on query and procedure if active
|
||||
void TrackFree(size_t);
|
||||
|
@ -107,6 +107,13 @@ enum class TypeId : uint64_t {
|
||||
COORD_SET_REPL_MAIN_RES,
|
||||
COORD_SWAP_UUID_REQ,
|
||||
COORD_SWAP_UUID_RES,
|
||||
COORD_UNREGISTER_REPLICA_REQ,
|
||||
COORD_UNREGISTER_REPLICA_RES,
|
||||
COORD_ENABLE_WRITING_ON_MAIN_REQ,
|
||||
COORD_ENABLE_WRITING_ON_MAIN_RES,
|
||||
|
||||
COORD_GET_UUID_REQ,
|
||||
COORD_GET_UUID_RES,
|
||||
|
||||
// AST
|
||||
AST_LABELIX = 3000,
|
||||
|
@ -39,8 +39,8 @@ endfunction()
|
||||
|
||||
add_subdirectory(fine_grained_access)
|
||||
add_subdirectory(server)
|
||||
#add_subdirectory(replication)
|
||||
#add_subdirectory(memory)
|
||||
add_subdirectory(replication)
|
||||
add_subdirectory(memory)
|
||||
add_subdirectory(triggers)
|
||||
add_subdirectory(isolation_levels)
|
||||
add_subdirectory(streams)
|
||||
@ -81,10 +81,7 @@ if (MG_EXPERIMENTAL_HIGH_AVAILABILITY)
|
||||
add_subdirectory(high_availability_experimental)
|
||||
endif ()
|
||||
|
||||
|
||||
if (MG_EXPERIMENTAL_REPLICATION_MULTITENANCY)
|
||||
add_subdirectory(replication_experimental)
|
||||
endif ()
|
||||
add_subdirectory(replication_experimental)
|
||||
|
||||
copy_e2e_python_files(pytest_runner pytest_runner.sh "")
|
||||
copy_e2e_python_files(x x.sh "")
|
||||
|
@ -69,6 +69,9 @@ startup_config_dict = {
|
||||
"coordinator_server_port": ("0", "0", "Port on which coordinator servers will be started."),
|
||||
"raft_server_port": ("0", "0", "Port on which raft servers will be started."),
|
||||
"raft_server_id": ("0", "0", "Unique ID of the raft server."),
|
||||
"instance_down_timeout_sec": ("5", "5", "Time duration after which an instance is considered down."),
|
||||
"instance_health_check_frequency_sec": ("1", "1", "The time duration between two health checks/pings."),
|
||||
"instance_get_uuid_frequency_sec": ("10", "10", "The time duration between two instance uuid checks."),
|
||||
"data_directory": ("mg_data", "mg_data", "Path to directory in which to save all permanent data."),
|
||||
"data_recovery_on_startup": (
|
||||
"false",
|
||||
@ -222,4 +225,9 @@ startup_config_dict = {
|
||||
"128",
|
||||
"The threshold for when to cache long delta chains. This is used for heavy read + write workloads where repeated processing of delta chains can become costly.",
|
||||
),
|
||||
"experimental_enabled": (
|
||||
"",
|
||||
"",
|
||||
"Experimental features to be used, comma seperated. Options [system-replication]",
|
||||
),
|
||||
}
|
||||
|
@ -1,10 +1,12 @@
|
||||
find_package(gflags REQUIRED)
|
||||
|
||||
copy_e2e_python_files(ha_experimental coordinator.py)
|
||||
copy_e2e_python_files(ha_experimental automatic_failover.py)
|
||||
copy_e2e_python_files(ha_experimental distributed_coordinators.py)
|
||||
copy_e2e_python_files(ha_experimental single_coordinator.py)
|
||||
copy_e2e_python_files(ha_experimental coord_cluster_registration.py)
|
||||
copy_e2e_python_files(ha_experimental distributed_coords.py)
|
||||
copy_e2e_python_files(ha_experimental manual_setting_replicas.py)
|
||||
copy_e2e_python_files(ha_experimental not_replicate_from_old_main.py)
|
||||
copy_e2e_python_files(ha_experimental disable_writing_on_main_after_restart.py)
|
||||
copy_e2e_python_files(ha_experimental common.py)
|
||||
copy_e2e_python_files(ha_experimental workloads.yaml)
|
||||
|
||||
|
@ -30,3 +30,14 @@ def safe_execute(function, *args):
|
||||
function(*args)
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
# NOTE: Repeated execution because it can fail if Raft server is not up
def add_coordinator(cursor, query, retries=10, delay_sec=0.5):
    """Run `query` on `cursor`, retrying while it keeps raising.

    Args:
        cursor: cursor handed to `execute_and_fetch_all`.
        query: query text to execute.
        retries: maximum number of attempts (default 10, as before).
        delay_sec: pause between two failed attempts, so the retries are not
            all used up back-to-back.

    Returns:
        True as soon as one attempt succeeds, False if every attempt raised.
    """
    import time

    for attempt in range(retries):
        try:
            execute_and_fetch_all(cursor, query)
            return True
        except Exception:
            # Best effort: swallow the error and try again, but do not sleep
            # after the final attempt.
            if attempt + 1 < retries:
                time.sleep(delay_sec)
    return False
|
||||
|
@ -0,0 +1,416 @@
|
||||
# Copyright 2022 Memgraph Ltd.
|
||||
#
|
||||
# Use of this software is governed by the Business Source License
|
||||
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
# License, and you may not use this file except in compliance with the Business Source License.
|
||||
#
|
||||
# As of the Change Date specified in that file, in accordance with
|
||||
# the Business Source License, use of this software will be governed
|
||||
# by the Apache License, Version 2.0, included in the file
|
||||
# licenses/APL.txt.
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import interactive_mg_runner
|
||||
import pytest
|
||||
from common import add_coordinator, connect, execute_and_fetch_all, safe_execute
|
||||
from mg_utils import mg_sleep_and_assert
|
||||
|
||||
interactive_mg_runner.SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
interactive_mg_runner.PROJECT_DIR = os.path.normpath(
|
||||
os.path.join(interactive_mg_runner.SCRIPT_DIR, "..", "..", "..", "..")
|
||||
)
|
||||
interactive_mg_runner.BUILD_DIR = os.path.normpath(os.path.join(interactive_mg_runner.PROJECT_DIR, "build"))
|
||||
interactive_mg_runner.MEMGRAPH_BINARY = os.path.normpath(os.path.join(interactive_mg_runner.BUILD_DIR, "memgraph"))
|
||||
|
||||
TEMP_DIR = tempfile.TemporaryDirectory().name
|
||||
|
||||
MEMGRAPH_INSTANCES_DESCRIPTION = {
|
||||
"instance_1": {
|
||||
"args": [
|
||||
"--bolt-port",
|
||||
"7687",
|
||||
"--log-level",
|
||||
"TRACE",
|
||||
"--coordinator-server-port",
|
||||
"10011",
|
||||
],
|
||||
"log_file": "instance_1.log",
|
||||
"data_directory": f"{TEMP_DIR}/instance_1",
|
||||
"setup_queries": [],
|
||||
},
|
||||
"instance_2": {
|
||||
"args": [
|
||||
"--bolt-port",
|
||||
"7688",
|
||||
"--log-level",
|
||||
"TRACE",
|
||||
"--coordinator-server-port",
|
||||
"10012",
|
||||
],
|
||||
"log_file": "instance_2.log",
|
||||
"data_directory": f"{TEMP_DIR}/instance_2",
|
||||
"setup_queries": [],
|
||||
},
|
||||
"instance_3": {
|
||||
"args": [
|
||||
"--bolt-port",
|
||||
"7689",
|
||||
"--log-level",
|
||||
"TRACE",
|
||||
"--coordinator-server-port",
|
||||
"10013",
|
||||
],
|
||||
"log_file": "instance_3.log",
|
||||
"data_directory": f"{TEMP_DIR}/instance_3",
|
||||
"setup_queries": [],
|
||||
},
|
||||
"coordinator_1": {
|
||||
"args": [
|
||||
"--bolt-port",
|
||||
"7690",
|
||||
"--log-level=TRACE",
|
||||
"--raft-server-id=1",
|
||||
"--raft-server-port=10111",
|
||||
],
|
||||
"log_file": "coordinator1.log",
|
||||
"setup_queries": [],
|
||||
},
|
||||
"coordinator_2": {
|
||||
"args": [
|
||||
"--bolt-port",
|
||||
"7691",
|
||||
"--log-level=TRACE",
|
||||
"--raft-server-id=2",
|
||||
"--raft-server-port=10112",
|
||||
],
|
||||
"log_file": "coordinator2.log",
|
||||
"setup_queries": [],
|
||||
},
|
||||
"coordinator_3": {
|
||||
"args": [
|
||||
"--bolt-port",
|
||||
"7692",
|
||||
"--log-level=TRACE",
|
||||
"--raft-server-id=3",
|
||||
"--raft-server-port=10113",
|
||||
],
|
||||
"log_file": "coordinator3.log",
|
||||
"setup_queries": [],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def test_register_repl_instances_then_coordinators():
|
||||
safe_execute(shutil.rmtree, TEMP_DIR)
|
||||
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
|
||||
|
||||
coordinator3_cursor = connect(host="localhost", port=7692).cursor()
|
||||
|
||||
execute_and_fetch_all(
|
||||
coordinator3_cursor, "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'"
|
||||
)
|
||||
execute_and_fetch_all(
|
||||
coordinator3_cursor, "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002'"
|
||||
)
|
||||
execute_and_fetch_all(
|
||||
coordinator3_cursor, "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003'"
|
||||
)
|
||||
execute_and_fetch_all(coordinator3_cursor, "SET INSTANCE instance_3 TO MAIN")
|
||||
assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 1 ON '127.0.0.1:10111'")
|
||||
assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 2 ON '127.0.0.1:10112'")
|
||||
|
||||
def check_coordinator3():
|
||||
return sorted(list(execute_and_fetch_all(coordinator3_cursor, "SHOW INSTANCES")))
|
||||
|
||||
expected_cluster_coord3 = [
|
||||
("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
|
||||
("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
|
||||
("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
|
||||
("instance_1", "", "127.0.0.1:10011", True, "replica"),
|
||||
("instance_2", "", "127.0.0.1:10012", True, "replica"),
|
||||
("instance_3", "", "127.0.0.1:10013", True, "main"),
|
||||
]
|
||||
mg_sleep_and_assert(expected_cluster_coord3, check_coordinator3)
|
||||
|
||||
coordinator1_cursor = connect(host="localhost", port=7690).cursor()
|
||||
|
||||
def check_coordinator1():
|
||||
return sorted(list(execute_and_fetch_all(coordinator1_cursor, "SHOW INSTANCES")))
|
||||
|
||||
# TODO: (andi) This should be solved eventually
|
||||
expected_cluster_not_shared = [
|
||||
("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
|
||||
("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
|
||||
("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
|
||||
]
|
||||
|
||||
mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
|
||||
|
||||
coordinator2_cursor = connect(host="localhost", port=7691).cursor()
|
||||
|
||||
def check_coordinator2():
|
||||
return sorted(list(execute_and_fetch_all(coordinator2_cursor, "SHOW INSTANCES")))
|
||||
|
||||
mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator2)
|
||||
|
||||
|
||||
def test_register_coordinator_then_repl_instances():
|
||||
safe_execute(shutil.rmtree, TEMP_DIR)
|
||||
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
|
||||
|
||||
coordinator3_cursor = connect(host="localhost", port=7692).cursor()
|
||||
|
||||
assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 1 ON '127.0.0.1:10111'")
|
||||
assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 2 ON '127.0.0.1:10112'")
|
||||
execute_and_fetch_all(
|
||||
coordinator3_cursor, "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'"
|
||||
)
|
||||
execute_and_fetch_all(
|
||||
coordinator3_cursor, "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002'"
|
||||
)
|
||||
execute_and_fetch_all(
|
||||
coordinator3_cursor, "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003'"
|
||||
)
|
||||
execute_and_fetch_all(coordinator3_cursor, "SET INSTANCE instance_3 TO MAIN")
|
||||
|
||||
def check_coordinator3():
|
||||
return sorted(list(execute_and_fetch_all(coordinator3_cursor, "SHOW INSTANCES")))
|
||||
|
||||
expected_cluster_coord3 = [
|
||||
("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
|
||||
("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
|
||||
("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
|
||||
("instance_1", "", "127.0.0.1:10011", True, "replica"),
|
||||
("instance_2", "", "127.0.0.1:10012", True, "replica"),
|
||||
("instance_3", "", "127.0.0.1:10013", True, "main"),
|
||||
]
|
||||
mg_sleep_and_assert(expected_cluster_coord3, check_coordinator3)
|
||||
|
||||
coordinator1_cursor = connect(host="localhost", port=7690).cursor()
|
||||
|
||||
def check_coordinator1():
|
||||
return sorted(list(execute_and_fetch_all(coordinator1_cursor, "SHOW INSTANCES")))
|
||||
|
||||
# TODO: (andi) This should be solved eventually
|
||||
expected_cluster_not_shared = [
|
||||
("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
|
||||
("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
|
||||
("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
|
||||
]
|
||||
|
||||
mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
|
||||
|
||||
coordinator2_cursor = connect(host="localhost", port=7691).cursor()
|
||||
|
||||
def check_coordinator2():
|
||||
return sorted(list(execute_and_fetch_all(coordinator2_cursor, "SHOW INSTANCES")))
|
||||
|
||||
mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator2)
|
||||
|
||||
|
||||
def test_coordinators_communication_with_restarts():
|
||||
safe_execute(shutil.rmtree, TEMP_DIR)
|
||||
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
|
||||
|
||||
coordinator3_cursor = connect(host="localhost", port=7692).cursor()
|
||||
|
||||
assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 1 ON '127.0.0.1:10111'")
|
||||
assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 2 ON '127.0.0.1:10112'")
|
||||
execute_and_fetch_all(
|
||||
coordinator3_cursor, "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'"
|
||||
)
|
||||
execute_and_fetch_all(
|
||||
coordinator3_cursor, "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002'"
|
||||
)
|
||||
execute_and_fetch_all(
|
||||
coordinator3_cursor, "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003'"
|
||||
)
|
||||
execute_and_fetch_all(coordinator3_cursor, "SET INSTANCE instance_3 TO MAIN")
|
||||
|
||||
expected_cluster_not_shared = [
|
||||
("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
|
||||
("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
|
||||
("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
|
||||
]
|
||||
|
||||
coordinator1_cursor = connect(host="localhost", port=7690).cursor()
|
||||
|
||||
def check_coordinator1():
|
||||
return sorted(list(execute_and_fetch_all(coordinator1_cursor, "SHOW INSTANCES")))
|
||||
|
||||
mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
|
||||
|
||||
coordinator2_cursor = connect(host="localhost", port=7691).cursor()
|
||||
|
||||
def check_coordinator2():
|
||||
return sorted(list(execute_and_fetch_all(coordinator2_cursor, "SHOW INSTANCES")))
|
||||
|
||||
mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator2)
|
||||
|
||||
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_1")
|
||||
interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_1")
|
||||
coordinator1_cursor = connect(host="localhost", port=7690).cursor()
|
||||
|
||||
mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
|
||||
|
||||
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_1")
|
||||
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_2")
|
||||
|
||||
interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_1")
|
||||
interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_2")
|
||||
coordinator1_cursor = connect(host="localhost", port=7690).cursor()
|
||||
coordinator2_cursor = connect(host="localhost", port=7691).cursor()
|
||||
|
||||
mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
|
||||
mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator2)
|
||||
|
||||
|
||||
# TODO: (andi) Test when dealing with distributed coordinators that you can register on one coordinator and unregister from any other coordinator
|
||||
@pytest.mark.parametrize(
|
||||
"kill_instance",
|
||||
[True, False],
|
||||
)
|
||||
def test_unregister_replicas(kill_instance):
|
||||
safe_execute(shutil.rmtree, TEMP_DIR)
|
||||
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
|
||||
|
||||
coordinator3_cursor = connect(host="localhost", port=7692).cursor()
|
||||
execute_and_fetch_all(
|
||||
coordinator3_cursor, "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'"
|
||||
)
|
||||
execute_and_fetch_all(
|
||||
coordinator3_cursor, "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002'"
|
||||
)
|
||||
execute_and_fetch_all(
|
||||
coordinator3_cursor, "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003'"
|
||||
)
|
||||
execute_and_fetch_all(coordinator3_cursor, "SET INSTANCE instance_3 TO MAIN")
|
||||
|
||||
def check_coordinator3():
|
||||
return sorted(list(execute_and_fetch_all(coordinator3_cursor, "SHOW INSTANCES")))
|
||||
|
||||
main_cursor = connect(host="localhost", port=7689).cursor()
|
||||
|
||||
def check_main():
|
||||
return sorted(list(execute_and_fetch_all(main_cursor, "SHOW REPLICAS")))
|
||||
|
||||
expected_cluster = [
|
||||
("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
|
||||
("instance_1", "", "127.0.0.1:10011", True, "replica"),
|
||||
("instance_2", "", "127.0.0.1:10012", True, "replica"),
|
||||
("instance_3", "", "127.0.0.1:10013", True, "main"),
|
||||
]
|
||||
|
||||
expected_replicas = [
|
||||
("instance_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
|
||||
("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
|
||||
]
|
||||
|
||||
mg_sleep_and_assert(expected_cluster, check_coordinator3)
|
||||
mg_sleep_and_assert(expected_replicas, check_main)
|
||||
|
||||
if kill_instance:
|
||||
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_1")
|
||||
execute_and_fetch_all(coordinator3_cursor, "UNREGISTER INSTANCE instance_1")
|
||||
|
||||
expected_cluster = [
|
||||
("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
|
||||
("instance_2", "", "127.0.0.1:10012", True, "replica"),
|
||||
("instance_3", "", "127.0.0.1:10013", True, "main"),
|
||||
]
|
||||
|
||||
expected_replicas = [
|
||||
("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
|
||||
]
|
||||
|
||||
mg_sleep_and_assert(expected_cluster, check_coordinator3)
|
||||
mg_sleep_and_assert(expected_replicas, check_main)
|
||||
|
||||
if kill_instance:
|
||||
interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_2")
|
||||
execute_and_fetch_all(coordinator3_cursor, "UNREGISTER INSTANCE instance_2")
|
||||
|
||||
expected_cluster = [
|
||||
("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
|
||||
("instance_3", "", "127.0.0.1:10013", True, "main"),
|
||||
]
|
||||
expected_replicas = []
|
||||
|
||||
mg_sleep_and_assert(expected_cluster, check_coordinator3)
|
||||
mg_sleep_and_assert(expected_replicas, check_main)
|
||||
|
||||
|
||||
def test_unregister_main():
    """Check that a MAIN instance can only be unregistered after it is down.

    Flow:
    1. Register three instances through coordinator 3 and set instance_3 to MAIN.
    2. Try to unregister the alive MAIN and require the query to be rejected.
    3. Kill instance_3 and wait for SHOW INSTANCES to report instance_1 as main
       and instance_3 as down.
    4. Unregister instance_3 and verify it is gone from SHOW INSTANCES on the
       coordinator and that SHOW REPLICAS on port 7687 lists only instance_2.
    """
    safe_execute(shutil.rmtree, TEMP_DIR)
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    coordinator3_cursor = connect(host="localhost", port=7692).cursor()
    execute_and_fetch_all(
        coordinator3_cursor, "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'"
    )
    execute_and_fetch_all(
        coordinator3_cursor, "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002'"
    )
    execute_and_fetch_all(
        coordinator3_cursor, "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003'"
    )
    execute_and_fetch_all(coordinator3_cursor, "SET INSTANCE instance_3 TO MAIN")

    def check_coordinator3():
        return sorted(list(execute_and_fetch_all(coordinator3_cursor, "SHOW INSTANCES")))

    expected_cluster = [
        ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
        ("instance_1", "", "127.0.0.1:10011", True, "replica"),
        ("instance_2", "", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "", "127.0.0.1:10013", True, "main"),
    ]

    mg_sleep_and_assert(expected_cluster, check_coordinator3)

    # The alive MAIN must not be unregistered. pytest.raises fails the test when
    # the query unexpectedly succeeds; a bare try/except would let that pass.
    with pytest.raises(Exception) as e:
        execute_and_fetch_all(coordinator3_cursor, "UNREGISTER INSTANCE instance_3")
    assert (
        str(e.value)
        == "Alive main instance can't be unregistered! Shut it down to trigger failover and then unregister it!"
    )

    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")

    # After the kill, instance_1 is expected as main and instance_3 as down.
    expected_cluster = [
        ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
        ("instance_1", "", "127.0.0.1:10011", True, "main"),
        ("instance_2", "", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "", "127.0.0.1:10013", False, "unknown"),
    ]

    mg_sleep_and_assert(expected_cluster, check_coordinator3)

    # With the old MAIN down, unregistering it is expected to succeed.
    execute_and_fetch_all(coordinator3_cursor, "UNREGISTER INSTANCE instance_3")

    expected_cluster = [
        ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
        ("instance_1", "", "127.0.0.1:10011", True, "main"),
        ("instance_2", "", "127.0.0.1:10012", True, "replica"),
    ]

    expected_replicas = [
        ("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
    ]

    # NOTE(review): port 7687 is presumably instance_1's Bolt port; the instance
    # description for this module is not visible here - confirm.
    main_cursor = connect(host="localhost", port=7687).cursor()

    def check_main():
        return sorted(list(execute_and_fetch_all(main_cursor, "SHOW REPLICAS")))

    mg_sleep_and_assert(expected_cluster, check_coordinator3)
    mg_sleep_and_assert(expected_replicas, check_main)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests through pytest and propagate its exit status.
    exit_code = pytest.main([__file__, "-rA"])
    sys.exit(exit_code)
|
@ -0,0 +1,181 @@
|
||||
# Copyright 2022 Memgraph Ltd.
|
||||
#
|
||||
# Use of this software is governed by the Business Source License
|
||||
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
# License, and you may not use this file except in compliance with the Business Source License.
|
||||
#
|
||||
# As of the Change Date specified in that file, in accordance with
|
||||
# the Business Source License, use of this software will be governed
|
||||
# by the Apache License, Version 2.0, included in the file
|
||||
# licenses/APL.txt.
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import interactive_mg_runner
|
||||
import pytest
|
||||
from common import add_coordinator, connect, execute_and_fetch_all, safe_execute
|
||||
from mg_utils import mg_sleep_and_assert
|
||||
|
||||
# Point the shared runner at this checkout: the project root is four levels
# above this script's directory, and the memgraph binary lives in <root>/build.
interactive_mg_runner.SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
interactive_mg_runner.PROJECT_DIR = os.path.normpath(
    os.path.join(interactive_mg_runner.SCRIPT_DIR, os.pardir, os.pardir, os.pardir, os.pardir)
)
interactive_mg_runner.BUILD_DIR = os.path.normpath(os.path.join(interactive_mg_runner.PROJECT_DIR, "build"))
interactive_mg_runner.MEMGRAPH_BINARY = os.path.normpath(os.path.join(interactive_mg_runner.BUILD_DIR, "memgraph"))

# Only the generated path string is kept; the TemporaryDirectory object itself
# is not retained.
TEMP_DIR = tempfile.TemporaryDirectory().name
|
||||
|
||||
# Three data instances (Bolt 7687-7689) and three coordinators (Bolt 7690-7692).
# Each flag is kept on one line with its value.
MEMGRAPH_INSTANCES_DESCRIPTION = {
    "instance_1": {
        "args": [
            "--bolt-port", "7687",
            "--log-level", "TRACE",
            "--coordinator-server-port", "10011",
            "--also-log-to-stderr",
            "--instance-health-check-frequency-sec", "1",
            "--instance-down-timeout-sec", "5",
        ],
        "log_file": "instance_1.log",
        "data_directory": f"{TEMP_DIR}/instance_1",
        "setup_queries": [],
    },
    "instance_2": {
        "args": [
            "--bolt-port", "7688",
            "--log-level", "TRACE",
            "--coordinator-server-port", "10012",
            "--also-log-to-stderr",
            "--instance-health-check-frequency-sec", "1",
            "--instance-down-timeout-sec", "5",
        ],
        "log_file": "instance_2.log",
        "data_directory": f"{TEMP_DIR}/instance_2",
        "setup_queries": [],
    },
    # NOTE(review): instance_3 uses 5/10 for the health-check/down-timeout flags
    # while instances 1 and 2 use 1/5 - confirm this difference is intended.
    "instance_3": {
        "args": [
            "--bolt-port", "7689",
            "--log-level", "TRACE",
            "--coordinator-server-port", "10013",
            "--also-log-to-stderr",
            "--instance-health-check-frequency-sec", "5",
            "--instance-down-timeout-sec", "10",
        ],
        "log_file": "instance_3.log",
        "data_directory": f"{TEMP_DIR}/instance_3",
        "setup_queries": [],
    },
    "coordinator_1": {
        "args": [
            "--bolt-port", "7690",
            "--log-level=TRACE",
            "--raft-server-id=1",
            "--raft-server-port=10111",
        ],
        "log_file": "coordinator1.log",
        "setup_queries": [],
    },
    "coordinator_2": {
        "args": [
            "--bolt-port", "7691",
            "--log-level=TRACE",
            "--raft-server-id=2",
            "--raft-server-port=10112",
        ],
        "log_file": "coordinator2.log",
        "setup_queries": [],
    },
    "coordinator_3": {
        "args": [
            "--bolt-port", "7692",
            "--log-level=TRACE",
            "--raft-server-id=3",
            "--raft-server-port=10113",
            "--also-log-to-stderr",
        ],
        "log_file": "coordinator3.log",
        "setup_queries": [],
    },
}
|
||||
|
||||
|
||||
def test_writing_disabled_on_main_restart():
    """Check write handling on a MAIN instance that was killed and restarted.

    Flow:
    1. Register instance_3 through coordinator 3, set it to MAIN and add
       coordinators 1 and 2.
    2. Kill instance_3 and wait for SHOW INSTANCES to report it as down.
    3. Restart instance_3 and issue a write straight away; if the write is
       rejected, the error text must be the "write query forbidden" message.
    4. Wait for SHOW INSTANCES to report instance_3 as main again, then require
       a write to succeed.
    """
    safe_execute(shutil.rmtree, TEMP_DIR)
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    coordinator3_cursor = connect(host="localhost", port=7692).cursor()

    execute_and_fetch_all(
        coordinator3_cursor, "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003'"
    )
    execute_and_fetch_all(coordinator3_cursor, "SET INSTANCE instance_3 TO MAIN")
    assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 1 ON '127.0.0.1:10111'")
    assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 2 ON '127.0.0.1:10112'")

    def check_coordinator3():
        return sorted(list(execute_and_fetch_all(coordinator3_cursor, "SHOW INSTANCES")))

    expected_cluster_coord3 = [
        ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
        ("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
        ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
        ("instance_3", "", "127.0.0.1:10013", True, "main"),
    ]
    mg_sleep_and_assert(expected_cluster_coord3, check_coordinator3)

    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")

    expected_cluster_coord3 = [
        ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
        ("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
        ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
        ("instance_3", "", "127.0.0.1:10013", False, "unknown"),
    ]

    mg_sleep_and_assert(expected_cluster_coord3, check_coordinator3)

    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")

    # Connect outside the try block: a connection failure should surface as
    # itself rather than as a message mismatch, and the cursor must be bound
    # for the final write below.
    instance3_cursor = connect(host="localhost", port=7689).cursor()
    try:
        execute_and_fetch_all(instance3_cursor, "CREATE (n:Node {name: 'node'})")
    except Exception as e:
        # NOTE(review): a successful write is tolerated here, presumably because
        # the coordinator may already have re-enabled writing - confirm.
        assert (
            str(e)
            == "Write query forbidden on the main! Coordinator needs to enable writing on main by sending RPC message."
        )

    expected_cluster_coord3 = [
        ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
        ("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
        ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
        ("instance_3", "", "127.0.0.1:10013", True, "main"),
    ]

    mg_sleep_and_assert(expected_cluster_coord3, check_coordinator3)
    # Once the coordinator reports instance_3 as main again, the write must pass.
    execute_and_fetch_all(instance3_cursor, "CREATE (n:Node {name: 'node'})")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests through pytest and propagate its exit status.
    exit_code = pytest.main([__file__, "-rA"])
    sys.exit(exit_code)
|
@ -1,145 +0,0 @@
|
||||
# Copyright 2022 Memgraph Ltd.
|
||||
#
|
||||
# Use of this software is governed by the Business Source License
|
||||
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
# License, and you may not use this file except in compliance with the Business Source License.
|
||||
#
|
||||
# As of the Change Date specified in that file, in accordance with
|
||||
# the Business Source License, use of this software will be governed
|
||||
# by the Apache License, Version 2.0, included in the file
|
||||
# licenses/APL.txt.
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import interactive_mg_runner
|
||||
import pytest
|
||||
from common import connect, execute_and_fetch_all, safe_execute
|
||||
from mg_utils import mg_sleep_and_assert
|
||||
|
||||
# Point the shared runner at this checkout: the project root is four levels
# above this script's directory, and the memgraph binary lives in <root>/build.
interactive_mg_runner.SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
interactive_mg_runner.PROJECT_DIR = os.path.normpath(
    os.path.join(interactive_mg_runner.SCRIPT_DIR, os.pardir, os.pardir, os.pardir, os.pardir)
)
interactive_mg_runner.BUILD_DIR = os.path.normpath(os.path.join(interactive_mg_runner.PROJECT_DIR, "build"))
interactive_mg_runner.MEMGRAPH_BINARY = os.path.normpath(os.path.join(interactive_mg_runner.BUILD_DIR, "memgraph"))

# Only the generated path string is kept; the TemporaryDirectory object itself
# is not retained.
TEMP_DIR = tempfile.TemporaryDirectory().name
|
||||
|
||||
# Three coordinators only (Bolt 7687-7689). Coordinator 3 adds the other two
# through its setup queries.
MEMGRAPH_INSTANCES_DESCRIPTION = {
    "coordinator1": {
        "args": [
            "--bolt-port", "7687",
            "--log-level=TRACE",
            "--raft-server-id=1",
            "--raft-server-port=10111",
        ],
        "log_file": "coordinator1.log",
        "setup_queries": [],
    },
    "coordinator2": {
        "args": [
            "--bolt-port", "7688",
            "--log-level=TRACE",
            "--raft-server-id=2",
            "--raft-server-port=10112",
        ],
        "log_file": "coordinator2.log",
        "setup_queries": [],
    },
    "coordinator3": {
        "args": [
            "--bolt-port", "7689",
            "--log-level=TRACE",
            "--raft-server-id=3",
            "--raft-server-port=10113",
        ],
        "log_file": "coordinator3.log",
        "setup_queries": [
            "ADD COORDINATOR 1 ON '127.0.0.1:10111'",
            "ADD COORDINATOR 2 ON '127.0.0.1:10112'",
        ],
    },
}
|
||||
|
||||
|
||||
def test_coordinators_communication():
    """Every coordinator must report the same three-coordinator cluster.

    SHOW INSTANCES is queried on coordinator 3 first (port 7689), then on
    coordinator 1 (port 7687) and coordinator 2 (port 7688).
    """
    safe_execute(shutil.rmtree, TEMP_DIR)
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    def show_instances(cursor):
        return sorted(list(execute_and_fetch_all(cursor, "SHOW INSTANCES")))

    coordinator3_cursor = connect(host="localhost", port=7689).cursor()

    expected_cluster = [
        ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
        ("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
        ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
    ]
    mg_sleep_and_assert(expected_cluster, lambda: show_instances(coordinator3_cursor))

    coordinator1_cursor = connect(host="localhost", port=7687).cursor()
    mg_sleep_and_assert(expected_cluster, lambda: show_instances(coordinator1_cursor))

    coordinator2_cursor = connect(host="localhost", port=7688).cursor()
    mg_sleep_and_assert(expected_cluster, lambda: show_instances(coordinator2_cursor))
|
||||
|
||||
|
||||
def test_coordinators_communication_with_restarts():
    """Coordinators 1 and 2 must report the full cluster again after restarts.

    First coordinator 1 alone is killed and restarted, then coordinators 1 and 2
    are both killed and restarted. After each restart a fresh cursor is opened
    and SHOW INSTANCES is checked on it.
    """
    safe_execute(shutil.rmtree, TEMP_DIR)
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    expected_cluster = [
        ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
        ("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
        ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
    ]

    def show_instances(cursor):
        return sorted(list(execute_and_fetch_all(cursor, "SHOW INSTANCES")))

    # Each lambda reads the cursor variable when it is called, so checks made
    # after a restart use the cursor that was re-created for that coordinator.
    coordinator1_cursor = connect(host="localhost", port=7687).cursor()
    mg_sleep_and_assert(expected_cluster, lambda: show_instances(coordinator1_cursor))

    coordinator2_cursor = connect(host="localhost", port=7688).cursor()
    mg_sleep_and_assert(expected_cluster, lambda: show_instances(coordinator2_cursor))

    # Restart coordinator 1 on its own.
    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator1")
    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator1")
    coordinator1_cursor = connect(host="localhost", port=7687).cursor()

    mg_sleep_and_assert(expected_cluster, lambda: show_instances(coordinator1_cursor))

    # Restart coordinators 1 and 2 together.
    for name in ("coordinator1", "coordinator2"):
        interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, name)

    for name in ("coordinator1", "coordinator2"):
        interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, name)
    coordinator1_cursor = connect(host="localhost", port=7687).cursor()
    coordinator2_cursor = connect(host="localhost", port=7688).cursor()

    mg_sleep_and_assert(expected_cluster, lambda: show_instances(coordinator1_cursor))
    mg_sleep_and_assert(expected_cluster, lambda: show_instances(coordinator2_cursor))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests through pytest and propagate its exit status.
    exit_code = pytest.main([__file__, "-rA"])
    sys.exit(exit_code)
|
164
tests/e2e/high_availability_experimental/distributed_coords.py
Normal file
164
tests/e2e/high_availability_experimental/distributed_coords.py
Normal file
@ -0,0 +1,164 @@
|
||||
# Copyright 2022 Memgraph Ltd.
|
||||
#
|
||||
# Use of this software is governed by the Business Source License
|
||||
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
# License, and you may not use this file except in compliance with the Business Source License.
|
||||
#
|
||||
# As of the Change Date specified in that file, in accordance with
|
||||
# the Business Source License, use of this software will be governed
|
||||
# by the Apache License, Version 2.0, included in the file
|
||||
# licenses/APL.txt.
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import interactive_mg_runner
|
||||
import pytest
|
||||
from common import connect, execute_and_fetch_all, safe_execute
|
||||
from mg_utils import mg_sleep_and_assert
|
||||
|
||||
# Point the shared runner at this checkout: the project root is four levels
# above this script's directory, and the memgraph binary lives in <root>/build.
interactive_mg_runner.SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
interactive_mg_runner.PROJECT_DIR = os.path.normpath(
    os.path.join(interactive_mg_runner.SCRIPT_DIR, os.pardir, os.pardir, os.pardir, os.pardir)
)
interactive_mg_runner.BUILD_DIR = os.path.normpath(os.path.join(interactive_mg_runner.PROJECT_DIR, "build"))
interactive_mg_runner.MEMGRAPH_BINARY = os.path.normpath(os.path.join(interactive_mg_runner.BUILD_DIR, "memgraph"))

# Only the generated path string is kept; the TemporaryDirectory object itself
# is not retained.
TEMP_DIR = tempfile.TemporaryDirectory().name
|
||||
|
||||
# Three data instances (Bolt 7687-7689) and three coordinators (Bolt 7690-7692).
# Coordinator 3 builds the whole cluster through its setup queries: it adds the
# other two coordinators, registers the instances and sets instance_3 to MAIN.
MEMGRAPH_INSTANCES_DESCRIPTION = {
    "instance_1": {
        "args": [
            "--bolt-port", "7687",
            "--log-level", "TRACE",
            "--coordinator-server-port", "10011",
        ],
        "log_file": "instance_1.log",
        "data_directory": f"{TEMP_DIR}/instance_1",
        "setup_queries": [],
    },
    "instance_2": {
        "args": [
            "--bolt-port", "7688",
            "--log-level", "TRACE",
            "--coordinator-server-port", "10012",
        ],
        "log_file": "instance_2.log",
        "data_directory": f"{TEMP_DIR}/instance_2",
        "setup_queries": [],
    },
    "instance_3": {
        "args": [
            "--bolt-port", "7689",
            "--log-level", "TRACE",
            "--coordinator-server-port", "10013",
        ],
        "log_file": "instance_3.log",
        "data_directory": f"{TEMP_DIR}/instance_3",
        "setup_queries": [],
    },
    "coordinator_1": {
        "args": [
            "--bolt-port", "7690",
            "--log-level=TRACE",
            "--raft-server-id=1",
            "--raft-server-port=10111",
        ],
        "log_file": "coordinator1.log",
        "setup_queries": [],
    },
    "coordinator_2": {
        "args": [
            "--bolt-port", "7691",
            "--log-level=TRACE",
            "--raft-server-id=2",
            "--raft-server-port=10112",
        ],
        "log_file": "coordinator2.log",
        "setup_queries": [],
    },
    "coordinator_3": {
        "args": [
            "--bolt-port", "7692",
            "--log-level=TRACE",
            "--raft-server-id=3",
            "--raft-server-port=10113",
        ],
        "log_file": "coordinator3.log",
        "setup_queries": [
            "ADD COORDINATOR 1 ON '127.0.0.1:10111'",
            "ADD COORDINATOR 2 ON '127.0.0.1:10112'",
            "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'",
            "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002'",
            "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003'",
            "SET INSTANCE instance_3 TO MAIN",
        ],
    },
}
|
||||
|
||||
|
||||
def test_distributed_automatic_failover():
    """Failover with three coordinators: kill MAIN, expect a new MAIN, restart the old one.

    Flow:
    1. Check SHOW REPLICAS on instance_3 (port 7689) lists instance_1 and
       instance_2 as ready.
    2. Kill instance_3 and wait for coordinator 3 to report instance_1 as main
       and instance_3 as down.
    3. Check SHOW REPLICAS on instance_1 (port 7687): instance_3 is listed as
       "invalid" while down and "ready" once it has been started again.
    """
    safe_execute(shutil.rmtree, TEMP_DIR)
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    def fetch_sorted(cursor, query):
        return sorted(list(execute_and_fetch_all(cursor, query)))

    main_cursor = connect(host="localhost", port=7689).cursor()
    assert fetch_sorted(main_cursor, "SHOW REPLICAS;") == [
        ("instance_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
        ("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
    ]

    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")

    coord_cursor = connect(host="localhost", port=7692).cursor()
    cluster_after_failover = [
        ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
        ("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
        ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
        ("instance_1", "", "127.0.0.1:10011", True, "main"),
        ("instance_2", "", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "", "127.0.0.1:10013", False, "unknown"),
    ]
    mg_sleep_and_assert(cluster_after_failover, lambda: fetch_sorted(coord_cursor, "SHOW INSTANCES;"))

    new_main_cursor = connect(host="localhost", port=7687).cursor()

    def replicas_on_new_main():
        return fetch_sorted(new_main_cursor, "SHOW REPLICAS;")

    # While the old MAIN is still down it is listed as an invalid replica.
    replicas_old_main_down = [
        ("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
        ("instance_3", "127.0.0.1:10003", "sync", 0, 0, "invalid"),
    ]
    mg_sleep_and_assert(replicas_old_main_down, replicas_on_new_main)

    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")

    # After the restart, instance_3 is expected to be listed as ready.
    replicas_old_main_alive = [
        ("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
        ("instance_3", "127.0.0.1:10003", "sync", 0, 0, "ready"),
    ]
    mg_sleep_and_assert(replicas_old_main_alive, replicas_on_new_main)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests through pytest and propagate its exit status.
    exit_code = pytest.main([__file__, "-rA"])
    sys.exit(exit_code)
|
@ -10,11 +10,13 @@
|
||||
# licenses/APL.txt.
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
import interactive_mg_runner
|
||||
import pytest
|
||||
from common import execute_and_fetch_all
|
||||
from common import execute_and_fetch_all, safe_execute
|
||||
from mg_utils import mg_sleep_and_assert
|
||||
|
||||
interactive_mg_runner.SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
@ -38,7 +40,7 @@ MEMGRAPH_FIRST_CLUSTER_DESCRIPTION = {
|
||||
}
|
||||
|
||||
|
||||
MEMGRAPH_INSTANCES_DESCRIPTION = {
|
||||
MEMGRAPH_SECOND_CLUSTER_DESCRIPTION = {
|
||||
"replica": {
|
||||
"args": ["--bolt-port", "7689", "--log-level", "TRACE"],
|
||||
"log_file": "replica.log",
|
||||
@ -71,7 +73,7 @@ def test_replication_works_on_failover(connection):
|
||||
assert actual_data_on_main == expected_data_on_main
|
||||
|
||||
# 3
|
||||
interactive_mg_runner.start_all_keep_others(MEMGRAPH_INSTANCES_DESCRIPTION)
|
||||
interactive_mg_runner.start_all_keep_others(MEMGRAPH_SECOND_CLUSTER_DESCRIPTION)
|
||||
|
||||
# 4
|
||||
new_main_cursor = connection(7690, "main_2").cursor()
|
||||
@ -113,5 +115,144 @@ def test_replication_works_on_failover(connection):
|
||||
interactive_mg_runner.stop_all()
|
||||
|
||||
|
||||
def test_not_replicate_old_main_register_new_cluster(connection):
    """Check that a replica registered in one cluster can be re-registered in another.

    This flow checks that registering a replica is idempotent and that the old
    main can no longer talk to the replica:
    1. Start the replicas, main and coordinator of the first cluster.
    2. The first coordinator can see all of its instances.
    3. Start the second cluster, reusing one replica from the first cluster.
    4. The new coordinator should see the replica; registration should pass.
    5. The old main should not replicate to the shared replica.
    6. The new main should replicate to the shared replica.

    `connection` is a fixture called with a Bolt port and a label; it returns an
    object whose `.cursor()` is used for queries.
    """
    TEMP_DIR = tempfile.TemporaryDirectory().name
    MEMGRAPH_FIRST_COORD_CLUSTER_DESCRIPTION = {
        "shared_instance": {
            "args": [
                "--bolt-port",
                "7688",
                "--log-level",
                "TRACE",
                "--coordinator-server-port",
                "10011",
            ],
            "log_file": "instance_1.log",
            "data_directory": f"{TEMP_DIR}/shared_instance",
            "setup_queries": [],
        },
        "instance_2": {
            "args": [
                "--bolt-port",
                "7689",
                "--log-level",
                "TRACE",
                "--coordinator-server-port",
                "10012",
            ],
            "log_file": "instance_2.log",
            "data_directory": f"{TEMP_DIR}/instance_2",
            "setup_queries": [],
        },
        "coordinator_1": {
            "args": ["--bolt-port", "7690", "--log-level=TRACE", "--raft-server-id=1", "--raft-server-port=10111"],
            "log_file": "coordinator.log",
            "setup_queries": [
                "REGISTER INSTANCE shared_instance ON '127.0.0.1:10011' WITH '127.0.0.1:10001';",
                "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002';",
                "SET INSTANCE instance_2 TO MAIN",
            ],
        },
    }

    # 1
    interactive_mg_runner.start_all_keep_others(MEMGRAPH_FIRST_COORD_CLUSTER_DESCRIPTION)

    # 2
    first_cluster_coord_cursor = connection(7690, "coord_1").cursor()

    def show_first_cluster():
        return sorted(list(execute_and_fetch_all(first_cluster_coord_cursor, "SHOW INSTANCES;")))

    expected_data_up_first_cluster = [
        ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
        ("instance_2", "", "127.0.0.1:10012", True, "main"),
        ("shared_instance", "", "127.0.0.1:10011", True, "replica"),
    ]

    mg_sleep_and_assert(expected_data_up_first_cluster, show_first_cluster)

    # 3
    MEMGRAPH_SECOND_COORD_CLUSTER_DESCRIPTION = {
        "instance_3": {
            "args": [
                "--bolt-port",
                "7687",
                "--log-level",
                "TRACE",
                "--coordinator-server-port",
                "10013",
            ],
            "log_file": "instance_3.log",
            "data_directory": f"{TEMP_DIR}/instance_3",
            "setup_queries": [],
        },
        "coordinator_2": {
            "args": ["--bolt-port", "7691", "--log-level=TRACE", "--raft-server-id=1", "--raft-server-port=10112"],
            # Both coordinators run at the same time and were configured with the
            # same log_file name; give the second one its own file.
            # NOTE(review): presumably both names resolve into the same logs
            # directory - confirm against interactive_mg_runner.
            "log_file": "coordinator_2.log",
            "setup_queries": [],
        },
    }

    interactive_mg_runner.start_all_keep_others(MEMGRAPH_SECOND_COORD_CLUSTER_DESCRIPTION)
    second_cluster_coord_cursor = connection(7691, "coord_2").cursor()
    execute_and_fetch_all(
        second_cluster_coord_cursor, "REGISTER INSTANCE shared_instance ON '127.0.0.1:10011' WITH '127.0.0.1:10001';"
    )
    execute_and_fetch_all(
        second_cluster_coord_cursor, "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003';"
    )
    execute_and_fetch_all(second_cluster_coord_cursor, "SET INSTANCE instance_3 TO MAIN")

    # 4
    def show_second_cluster():
        return sorted(list(execute_and_fetch_all(second_cluster_coord_cursor, "SHOW INSTANCES;")))

    # The second coordinator also runs with --raft-server-id=1, and the expected
    # row for it is named "coordinator_1" with its own raft port 10112.
    expected_data_up_second_cluster = [
        ("coordinator_1", "127.0.0.1:10112", "", True, "coordinator"),
        ("instance_3", "", "127.0.0.1:10013", True, "main"),
        ("shared_instance", "", "127.0.0.1:10011", True, "replica"),
    ]

    mg_sleep_and_assert(expected_data_up_second_cluster, show_second_cluster)

    # 5
    main_1_cursor = connection(7689, "main_1").cursor()
    with pytest.raises(Exception) as e:
        execute_and_fetch_all(main_1_cursor, "CREATE ();")
    assert (
        str(e.value)
        == "Replication Exception: At least one SYNC replica has not confirmed committing last transaction. Check the status of the replicas using 'SHOW REPLICAS' query."
    )

    shared_replica_cursor = connection(7688, "shared_replica").cursor()
    res = execute_and_fetch_all(shared_replica_cursor, "MATCH (n) RETURN count(n);")[0][0]
    assert res == 0, "Old main should not replicate to 'shared' replica"

    # 6
    main_2_cursor = connection(7687, "main_2").cursor()

    execute_and_fetch_all(main_2_cursor, "CREATE ();")

    shared_replica_cursor = connection(7688, "shared_replica").cursor()
    res = execute_and_fetch_all(shared_replica_cursor, "MATCH (n) RETURN count(n);")[0][0]
    assert res == 1, "New main should replicate to 'shared' replica"

    interactive_mg_runner.stop_all()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests through pytest and propagate its exit status.
    exit_code = pytest.main([__file__, "-rA"])
    sys.exit(exit_code)
|
||||
|
@ -1,5 +1,4 @@
|
||||
# Copyright 2022 Memgraph Ltd.
|
||||
#
|
||||
# Use of this software is governed by the Business Source License
|
||||
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
# License, and you may not use this file except in compliance with the Business Source License.
|
||||
@ -148,6 +147,105 @@ def test_replication_works_on_failover():
|
||||
interactive_mg_runner.stop_all(MEMGRAPH_INSTANCES_DESCRIPTION)
|
||||
|
||||
|
||||
def test_replication_works_on_replica_instance_restart():
    """Replication must keep working after a replica goes down and restarts.

    Steps:
    1. Start all replicas, main and coordinator manually, so the test can kill
       them itself without relying on external tooling.
    2. Check that main has the correct state.
    3. Kill a replica.
    4. Check that main cannot replicate to that replica.
    5. Bring the replica back up.
    6. Check that the replica gets data.
    """
    safe_execute(shutil.rmtree, TEMP_DIR)

    # 1
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)

    # 2
    main_cursor = connect(host="localhost", port=7687).cursor()

    def show_replicas():
        return sorted(list(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;")))

    assert show_replicas() == [
        ("instance_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
        ("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
    ]

    # 3
    coord_cursor = connect(host="localhost", port=7690).cursor()

    def show_instances():
        return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW INSTANCES;")))

    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_2")

    cluster_replica_down = [
        ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
        ("instance_1", "", "127.0.0.1:10011", True, "replica"),
        ("instance_2", "", "127.0.0.1:10012", False, "unknown"),
        ("instance_3", "", "127.0.0.1:10013", True, "main"),
    ]
    mg_sleep_and_assert(cluster_replica_down, show_instances)

    replicas_before_write = [
        ("instance_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
        ("instance_2", "127.0.0.1:10002", "sync", 0, 0, "invalid"),
    ]
    mg_sleep_and_assert(replicas_before_write, show_replicas)

    # 4
    instance_1_cursor = connect(host="localhost", port=7688).cursor()
    with pytest.raises(Exception) as e:
        execute_and_fetch_all(main_cursor, "CREATE ();")
    assert (
        str(e.value)
        == "Replication Exception: At least one SYNC replica has not confirmed committing last transaction. Check the status of the replicas using 'SHOW REPLICAS' query."
    )

    # The replica that is still alive is expected to have received the vertex.
    res_instance_1 = execute_and_fetch_all(instance_1_cursor, "MATCH (n) RETURN count(n)")[0][0]
    assert res_instance_1 == 1

    replicas_after_write = [
        ("instance_1", "127.0.0.1:10001", "sync", 2, 0, "ready"),
        ("instance_2", "127.0.0.1:10002", "sync", 0, 0, "invalid"),
    ]
    mg_sleep_and_assert(replicas_after_write, show_replicas)

    # 5.
    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_2")

    cluster_replica_back = [
        ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
        ("instance_1", "", "127.0.0.1:10011", True, "replica"),
        ("instance_2", "", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "", "127.0.0.1:10013", True, "main"),
    ]
    mg_sleep_and_assert(cluster_replica_back, show_instances)

    replicas_caught_up = [
        ("instance_1", "127.0.0.1:10001", "sync", 2, 0, "ready"),
        ("instance_2", "127.0.0.1:10002", "sync", 2, 0, "ready"),
    ]
    mg_sleep_and_assert(replicas_caught_up, show_replicas)

    # 6.
    instance_2_cursor = connect(port=7689, host="localhost").cursor()
    execute_and_fetch_all(main_cursor, "CREATE ();")
    res_instance_2 = execute_and_fetch_all(instance_2_cursor, "MATCH (n) RETURN count(n)")[0][0]
    assert res_instance_2 == 2
|
||||
|
||||
|
||||
def test_show_instances():
|
||||
safe_execute(shutil.rmtree, TEMP_DIR)
|
||||
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
|
||||
@ -417,5 +515,20 @@ def test_automatic_failover_main_back_as_main():
|
||||
mg_sleep_and_assert([("main",)], retrieve_data_show_repl_role_instance3)
|
||||
|
||||
|
||||
def test_disable_multiple_mains():
|
||||
safe_execute(shutil.rmtree, TEMP_DIR)
|
||||
interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
|
||||
|
||||
coord_cursor = connect(host="localhost", port=7690).cursor()
|
||||
|
||||
try:
|
||||
execute_and_fetch_all(
|
||||
coord_cursor,
|
||||
"SET INSTANCE instance_1 TO MAIN;",
|
||||
)
|
||||
except Exception as e:
|
||||
assert str(e) == "Couldn't set instance to main since there is already a main instance in cluster!"
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(pytest.main([__file__, "-rA"]))
|
@ -28,18 +28,26 @@ workloads:
|
||||
args: ["high_availability_experimental/coordinator.py"]
|
||||
<<: *ha_cluster
|
||||
|
||||
- name: "Automatic failover"
|
||||
- name: "Single coordinator"
|
||||
binary: "tests/e2e/pytest_runner.sh"
|
||||
args: ["high_availability_experimental/automatic_failover.py"]
|
||||
args: ["high_availability_experimental/single_coordinator.py"]
|
||||
|
||||
- name: "Disabled manual setting of replication cluster"
|
||||
binary: "tests/e2e/pytest_runner.sh"
|
||||
args: ["high_availability_experimental/manual_setting_replicas.py"]
|
||||
|
||||
- name: "Distributed coordinators"
|
||||
- name: "Coordinator cluster registration"
|
||||
binary: "tests/e2e/pytest_runner.sh"
|
||||
args: ["high_availability_experimental/distributed_coordinators.py"]
|
||||
args: ["high_availability_experimental/coord_cluster_registration.py"]
|
||||
|
||||
- name: "Not replicate from old main"
|
||||
binary: "tests/e2e/pytest_runner.sh"
|
||||
args: ["high_availability_experimental/not_replicate_from_old_main.py"]
|
||||
|
||||
- name: "Disable writing on main after restart"
|
||||
binary: "tests/e2e/pytest_runner.sh"
|
||||
args: ["high_availability_experimental/disable_writing_on_main_after_restart.py"]
|
||||
|
||||
- name: "Distributed coordinators"
|
||||
binary: "tests/e2e/pytest_runner.sh"
|
||||
args: ["high_availability_experimental/distributed_coords.py"]
|
||||
|
@ -151,7 +151,10 @@ class MemgraphInstanceRunner:
|
||||
|
||||
if not keep_directories:
|
||||
for folder in self.delete_on_stop or {}:
|
||||
shutil.rmtree(folder)
|
||||
try:
|
||||
shutil.rmtree(folder)
|
||||
except Exception as e:
|
||||
pass # couldn't delete folder, skip
|
||||
|
||||
def kill(self, keep_directories=False):
|
||||
if not self.is_running():
|
||||
|
@ -22,9 +22,6 @@ target_link_libraries(memgraph__e2e__memory__limit_accumulation gflags mgclient
|
||||
add_executable(memgraph__e2e__memory__limit_edge_create memory_limit_edge_create.cpp)
|
||||
target_link_libraries(memgraph__e2e__memory__limit_edge_create gflags mgclient mg-utils mg-io)
|
||||
|
||||
add_executable(memgraph__e2e__memory_limit_global_multi_thread_proc_create memory_limit_global_multi_thread_proc_create.cpp)
|
||||
target_link_libraries(memgraph__e2e__memory_limit_global_multi_thread_proc_create gflags mgclient mg-utils mg-io)
|
||||
|
||||
add_executable(memgraph__e2e__memory_limit_global_thread_alloc_proc memory_limit_global_thread_alloc_proc.cpp)
|
||||
target_link_libraries(memgraph__e2e__memory_limit_global_thread_alloc_proc gflags mgclient mg-utils mg-io)
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Copyright 2023 Memgraph Ltd.
|
||||
// Copyright 2024 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
@ -44,7 +44,7 @@ int main(int argc, char **argv) {
|
||||
client->DiscardAll();
|
||||
}
|
||||
|
||||
const auto *create_query = "UNWIND range(1, 50) as u CREATE (n {string: \"Some longer string\"}) RETURN n;";
|
||||
const auto *create_query = "UNWIND range(1, 100) as u CREATE (n {string: \"Some longer string\"}) RETURN n;";
|
||||
|
||||
memgraph::utils::Timer timer;
|
||||
while (true) {
|
||||
|
@ -1,67 +0,0 @@
|
||||
// Copyright 2023 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
// License, and you may not use this file except in compliance with the Business Source License.
|
||||
//
|
||||
// As of the Change Date specified in that file, in accordance with
|
||||
// the Business Source License, use of this software will be governed
|
||||
// by the Apache License, Version 2.0, included in the file
|
||||
// licenses/APL.txt.
|
||||
|
||||
#include <gflags/gflags.h>
|
||||
#include <algorithm>
|
||||
#include <exception>
|
||||
#include <ios>
|
||||
#include <iostream>
|
||||
#include <mgclient.hpp>
|
||||
|
||||
#include "utils/logging.hpp"
|
||||
#include "utils/timer.hpp"
|
||||
|
||||
DEFINE_uint64(bolt_port, 7687, "Bolt port");
|
||||
DEFINE_uint64(timeout, 120, "Timeout seconds");
|
||||
DEFINE_bool(multi_db, false, "Run test in multi db environment");
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
google::SetUsageMessage("Memgraph E2E Global Memory Limit In Multi-Thread Create For Local Allocators");
|
||||
gflags::ParseCommandLineFlags(&argc, &argv, true);
|
||||
memgraph::logging::RedirectToStderr();
|
||||
|
||||
mg::Client::Init();
|
||||
|
||||
auto client =
|
||||
mg::Client::Connect({.host = "127.0.0.1", .port = static_cast<uint16_t>(FLAGS_bolt_port), .use_ssl = false});
|
||||
if (!client) {
|
||||
LOG_FATAL("Failed to connect!");
|
||||
}
|
||||
|
||||
if (FLAGS_multi_db) {
|
||||
client->Execute("CREATE DATABASE clean;");
|
||||
client->DiscardAll();
|
||||
client->Execute("USE DATABASE clean;");
|
||||
client->DiscardAll();
|
||||
client->Execute("MATCH (n) DETACH DELETE n;");
|
||||
client->DiscardAll();
|
||||
}
|
||||
|
||||
bool error{false};
|
||||
try {
|
||||
client->Execute(
|
||||
"CALL libglobal_memory_limit_multi_thread_create_proc.multi_create() PROCEDURE MEMORY UNLIMITED YIELD "
|
||||
"allocated_all RETURN allocated_all "
|
||||
"QUERY MEMORY LIMIT 50MB;");
|
||||
auto result_rows = client->FetchAll();
|
||||
if (result_rows) {
|
||||
auto row = *result_rows->begin();
|
||||
error = !row[0].ValueBool();
|
||||
}
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
error = true;
|
||||
}
|
||||
|
||||
MG_ASSERT(error, "Error should have happend");
|
||||
|
||||
return 0;
|
||||
}
|
@ -6,7 +6,7 @@ target_include_directories(global_memory_limit_proc PRIVATE ${CMAKE_SOURCE_DIR}/
|
||||
|
||||
add_library(query_memory_limit_proc_multi_thread SHARED query_memory_limit_proc_multi_thread.cpp)
|
||||
target_include_directories(query_memory_limit_proc_multi_thread PRIVATE ${CMAKE_SOURCE_DIR}/include)
|
||||
target_link_libraries(query_memory_limit_proc_multi_thread mg-utils)
|
||||
target_link_libraries(query_memory_limit_proc_multi_thread mg-utils )
|
||||
|
||||
add_library(query_memory_limit_proc SHARED query_memory_limit_proc.cpp)
|
||||
target_include_directories(query_memory_limit_proc PRIVATE ${CMAKE_SOURCE_DIR}/include)
|
||||
@ -16,10 +16,6 @@ add_library(global_memory_limit_thread_proc SHARED global_memory_limit_thread_pr
|
||||
target_include_directories(global_memory_limit_thread_proc PRIVATE ${CMAKE_SOURCE_DIR}/include)
|
||||
target_link_libraries(global_memory_limit_thread_proc mg-utils)
|
||||
|
||||
add_library(global_memory_limit_multi_thread_create_proc SHARED global_memory_limit_multi_thread_create_proc.cpp)
|
||||
target_include_directories(global_memory_limit_multi_thread_create_proc PRIVATE ${CMAKE_SOURCE_DIR}/include)
|
||||
target_link_libraries(global_memory_limit_multi_thread_create_proc mg-utils)
|
||||
|
||||
add_library(proc_memory_limit SHARED proc_memory_limit.cpp)
|
||||
target_include_directories(proc_memory_limit PRIVATE ${CMAKE_SOURCE_DIR}/include)
|
||||
target_link_libraries(proc_memory_limit mg-utils)
|
||||
|
@ -1,95 +0,0 @@
|
||||
// Copyright 2023 Memgraph Ltd.
|
||||
//
|
||||
// Use of this software is governed by the Business Source License
|
||||
// included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
|
||||
// License, and you may not use this file except in compliance with the Business Source License.
|
||||
//
|
||||
// As of the Change Date specified in that file, in accordance with
|
||||
// the Business Source License, use of this software will be governed
|
||||
// by the Apache License, Version 2.0, included in the file
|
||||
// licenses/APL.txt.
|
||||
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <exception>
|
||||
#include <functional>
|
||||
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "mg_procedure.h"
|
||||
#include "mgp.hpp"
|
||||
#include "utils/on_scope_exit.hpp"
|
||||
|
||||
// change communication between threads with feature and promise
|
||||
std::atomic<int> created_vertices{0};
|
||||
constexpr int num_vertices_per_thread{100'000};
|
||||
constexpr int num_threads{2};
|
||||
|
||||
void CallCreate(mgp_graph *graph, mgp_memory *memory) {
|
||||
[[maybe_unused]] const enum mgp_error tracking_error = mgp_track_current_thread_allocations(graph);
|
||||
for (int i = 0; i < num_vertices_per_thread; i++) {
|
||||
struct mgp_vertex *vertex{nullptr};
|
||||
auto enum_error = mgp_graph_create_vertex(graph, memory, &vertex);
|
||||
if (enum_error != mgp_error::MGP_ERROR_NO_ERROR) {
|
||||
break;
|
||||
}
|
||||
created_vertices.fetch_add(1, std::memory_order_acq_rel);
|
||||
}
|
||||
[[maybe_unused]] const enum mgp_error untracking_error = mgp_untrack_current_thread_allocations(graph);
|
||||
}
|
||||
|
||||
void AllocFunc(mgp_graph *graph, mgp_memory *memory) {
|
||||
try {
|
||||
CallCreate(graph, memory);
|
||||
} catch (const std::exception &e) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void MultiCreate(mgp_list *args, mgp_graph *memgraph_graph, mgp_result *result, mgp_memory *memory) {
|
||||
mgp::MemoryDispatcherGuard guard{memory};
|
||||
const auto arguments = mgp::List(args);
|
||||
const auto record_factory = mgp::RecordFactory(result);
|
||||
try {
|
||||
std::vector<std::thread> threads;
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
threads.emplace_back(AllocFunc, memgraph_graph, memory);
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_threads; i++) {
|
||||
threads[i].join();
|
||||
}
|
||||
if (created_vertices.load(std::memory_order_acquire) != num_vertices_per_thread * num_threads) {
|
||||
record_factory.SetErrorMessage("Unable to allocate");
|
||||
return;
|
||||
}
|
||||
|
||||
auto new_record = record_factory.NewRecord();
|
||||
new_record.Insert("allocated_all",
|
||||
created_vertices.load(std::memory_order_acquire) == num_vertices_per_thread * num_threads);
|
||||
} catch (std::exception &e) {
|
||||
record_factory.SetErrorMessage(e.what());
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" int mgp_init_module(struct mgp_module *module, struct mgp_memory *memory) {
|
||||
try {
|
||||
mgp::MemoryDispatcherGuard guard{memory};
|
||||
|
||||
AddProcedure(MultiCreate, std::string("multi_create").c_str(), mgp::ProcedureType::Write, {},
|
||||
{mgp::Return(std::string("allocated_all").c_str(), mgp::Type::Bool)}, module, memory);
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C" int mgp_shutdown_module() { return 0; }
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user