diff --git a/src/coordination/raft_state.cpp b/src/coordination/raft_state.cpp index d4d65cc36..dd441db74 100644 --- a/src/coordination/raft_state.cpp +++ b/src/coordination/raft_state.cpp @@ -11,10 +11,11 @@ #ifdef MG_ENTERPRISE -#include "coordination/raft_state.hpp" +#include #include "coordination/coordinator_config.hpp" #include "coordination/coordinator_exceptions.hpp" +#include "coordination/raft_state.hpp" #include "utils/counter.hpp" namespace memgraph::coordination { @@ -110,10 +111,34 @@ auto RaftState::AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_po -> void { auto const endpoint = fmt::format("{}:{}", raft_address, raft_port); srv_config const srv_config_to_add(static_cast(raft_server_id), endpoint); - if (!raft_server_->add_srv(srv_config_to_add)->get_accepted()) { - throw RaftAddServerException("Failed to add server {} to the cluster", endpoint); + + auto cmd_result = raft_server_->add_srv(srv_config_to_add); + + if (cmd_result->get_result_code() == nuraft::cmd_result_code::OK) { + spdlog::info("Request to add server {} to the cluster accepted", endpoint); + } else { + throw RaftAddServerException("Failed to accept request to add server {} to the cluster", endpoint); + } + + // Waiting for server to join + constexpr int max_tries{10}; + auto maybe_stop = utils::ResettableCounter(); + constexpr int waiting_period{200}; + bool added{false}; + while (!maybe_stop()) { + std::this_thread::sleep_for(std::chrono::milliseconds(waiting_period)); + const auto server_config = raft_server_->get_srv_config(static_cast(raft_server_id)); + if (server_config) { + spdlog::trace("Server with id {} added to cluster", raft_server_id); + added = true; + break; + } + } + + if (!added) { + throw RaftAddServerException("Failed to add server {} to the cluster in {}ms", endpoint, + max_tries * waiting_period); } - spdlog::info("Request to add server {} to the cluster accepted", endpoint); } auto RaftState::GetAllCoordinators() const -> std::vector> { diff --git a/tests/e2e/high_availability/coord_cluster_registration.py b/tests/e2e/high_availability/coord_cluster_registration.py index 774c6dca1..a285adcea 100644 --- a/tests/e2e/high_availability/coord_cluster_registration.py +++ b/tests/e2e/high_availability/coord_cluster_registration.py @@ -430,6 +430,7 @@ def test_unregister_main(): coordinator2_cursor = connect(host="localhost", port=7691).cursor() coordinator3_cursor = connect(host="localhost", port=7692).cursor() assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 1 ON '127.0.0.1:10111'") + assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 2 ON '127.0.0.1:10112'") execute_and_fetch_all( coordinator3_cursor, "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'" diff --git a/tests/e2e/high_availability/distributed_coords.py b/tests/e2e/high_availability/distributed_coords.py index 9fa654d68..33901f1d4 100644 --- a/tests/e2e/high_availability/distributed_coords.py +++ b/tests/e2e/high_availability/distributed_coords.py @@ -292,5 +292,127 @@ def test_distributed_automatic_failover_after_coord_dies(): mg_sleep_and_assert_collection(expected_data_on_new_main_old_alive, retrieve_data_show_replicas) +def test_registering_4_coords(): + # Goal of this test is to assure registering of multiple coordinators in row works + INSTANCES_DESCRIPTION = { + "instance_1": { + "args": [ + "--experimental-enabled=high-availability", + "--bolt-port", + "7687", + "--log-level", + "TRACE", + "--coordinator-server-port", + "10011", + ], + "log_file": "instance_1.log", + "data_directory": f"{TEMP_DIR}/instance_1", + "setup_queries": [], + }, + "instance_2": { + "args": [ + "--experimental-enabled=high-availability", + "--bolt-port", + "7688", + "--log-level", + "TRACE", + "--coordinator-server-port", + "10012", + ], + "log_file": "instance_2.log", + "data_directory": f"{TEMP_DIR}/instance_2", + "setup_queries": [], + }, + "instance_3": { + "args": [ + "--experimental-enabled=high-availability", + "--bolt-port", + "7689", + "--log-level", + "TRACE", + "--coordinator-server-port", + "10013", + ], + "log_file": "instance_3.log", + "data_directory": f"{TEMP_DIR}/instance_3", + "setup_queries": [], + }, + "coordinator_1": { + "args": [ + "--experimental-enabled=high-availability", + "--bolt-port", + "7690", + "--log-level=TRACE", + "--raft-server-id=1", + "--raft-server-port=10111", + ], + "log_file": "coordinator1.log", + "setup_queries": [], + }, + "coordinator_2": { + "args": [ + "--experimental-enabled=high-availability", + "--bolt-port", + "7691", + "--log-level=TRACE", + "--raft-server-id=2", + "--raft-server-port=10112", + ], + "log_file": "coordinator2.log", + "setup_queries": [], + }, + "coordinator_3": { + "args": [ + "--experimental-enabled=high-availability", + "--bolt-port", + "7692", + "--log-level=TRACE", + "--raft-server-id=3", + "--raft-server-port=10113", + ], + "log_file": "coordinator3.log", + "setup_queries": [], + }, + "coordinator_4": { + "args": [ + "--experimental-enabled=high-availability", + "--bolt-port", + "7693", + "--log-level=TRACE", + "--raft-server-id=4", + "--raft-server-port=10114", + ], + "log_file": "coordinator4.log", + "setup_queries": [ + "ADD COORDINATOR 1 ON '127.0.0.1:10111';", + "ADD COORDINATOR 2 ON '127.0.0.1:10112';", + "ADD COORDINATOR 3 ON '127.0.0.1:10113';", + "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'", + "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002'", + "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003'", + "SET INSTANCE instance_3 TO MAIN", + ], + }, + } + + interactive_mg_runner.start_all(INSTANCES_DESCRIPTION) + + coord_cursor = connect(host="localhost", port=7693).cursor() + + def retrieve_data_show_repl_cluster(): + return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW INSTANCES;"))) + + expected_data_on_coord = [ + ("coordinator_1", "127.0.0.1:10111", "", "unknown", "coordinator"), + ("coordinator_2", "127.0.0.1:10112", "", "unknown", "coordinator"), + ("coordinator_3", "127.0.0.1:10113", "", "unknown", "coordinator"), + ("coordinator_4", "127.0.0.1:10114", "", "unknown", "coordinator"), + ("instance_1", "", "127.0.0.1:10011", "up", "replica"), + ("instance_2", "", "127.0.0.1:10012", "up", "replica"), + ("instance_3", "", "127.0.0.1:10013", "up", "main"), + ] + mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster) + + if __name__ == "__main__": sys.exit(pytest.main([__file__, "-rA"]))