Fix bug-prone add-server-to-cluster behavior (#1792)

Antonio Filipovic 2024-03-07 12:10:33 +01:00 committed by GitHub
parent 6f849a14df
commit 02325f8673
3 changed files with 152 additions and 4 deletions


@@ -11,10 +11,11 @@
 #ifdef MG_ENTERPRISE
 
-#include "coordination/raft_state.hpp"
+#include <chrono>
 
 #include "coordination/coordinator_config.hpp"
 #include "coordination/coordinator_exceptions.hpp"
+#include "coordination/raft_state.hpp"
 #include "utils/counter.hpp"
 
 namespace memgraph::coordination {
 
@@ -110,10 +111,34 @@ auto RaftState::AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_po
     -> void {
   auto const endpoint = fmt::format("{}:{}", raft_address, raft_port);
   srv_config const srv_config_to_add(static_cast<int>(raft_server_id), endpoint);
-  if (!raft_server_->add_srv(srv_config_to_add)->get_accepted()) {
-    throw RaftAddServerException("Failed to add server {} to the cluster", endpoint);
+
+  auto cmd_result = raft_server_->add_srv(srv_config_to_add);
+
+  if (cmd_result->get_result_code() == nuraft::cmd_result_code::OK) {
+    spdlog::info("Request to add server {} to the cluster accepted", endpoint);
+  } else {
+    throw RaftAddServerException("Failed to accept request to add server {} to the cluster", endpoint);
   }
+
+  // Waiting for the server to join the cluster
+  constexpr int max_tries{10};
+  auto maybe_stop = utils::ResettableCounter<max_tries>();
+  constexpr int waiting_period{200};
+  bool added{false};
+  while (!maybe_stop()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(waiting_period));
+    const auto server_config = raft_server_->get_srv_config(static_cast<nuraft::int32>(raft_server_id));
+    if (server_config) {
+      spdlog::trace("Server with id {} added to cluster", raft_server_id);
+      added = true;
+      break;
+    }
+  }
+
+  if (!added) {
+    throw RaftAddServerException("Failed to add server {} to the cluster in {}ms", endpoint,
+                                 max_tries * waiting_period);
+  }
-  spdlog::info("Request to add server {} to the cluster accepted", endpoint);
 }
 
 auto RaftState::GetAllCoordinators() const -> std::vector<ptr<srv_config>> {
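
The shape of the fix: a result code of nuraft::cmd_result_code::OK from add_srv() only means the leader accepted the membership request, so the commit then polls get_srv_config() until the new server actually appears in the cluster configuration, giving up after max_tries * waiting_period = 10 * 200 ms = 2000 ms (the value reported in the exception message). Below is a minimal, NuRaft-free sketch of that bounded-poll pattern. MakeCounter and PollUntil are hypothetical names for illustration, and the assumed semantics of utils::ResettableCounter (a callable that starts returning true after N invocations) are inferred from its use in the diff, not shown here.

#include <chrono>
#include <cstdio>
#include <functional>
#include <thread>

// Hypothetical stand-in for utils::ResettableCounter: a callable that
// returns true once it has been invoked more than MaxTries times.
template <int MaxTries>
auto MakeCounter() {
  return [count = 0]() mutable { return ++count > MaxTries; };
}

// Generic bounded poll: sleep, re-check a predicate, and give up after
// roughly MaxTries * period_ms milliseconds. Mirrors the loop added above.
template <int MaxTries>
bool PollUntil(const std::function<bool()> &joined, int period_ms) {
  auto stop = MakeCounter<MaxTries>();
  while (!stop()) {
    std::this_thread::sleep_for(std::chrono::milliseconds(period_ms));
    // In the real code the predicate would be
    // raft_server_->get_srv_config(raft_server_id) != nullptr.
    if (joined()) return true;
  }
  return false;
}

int main() {
  int checks = 0;
  // Pretend the server shows up in the cluster config on the third check.
  bool ok = PollUntil<10>([&checks] { return ++checks >= 3; }, /*period_ms=*/10);
  std::printf("joined: %s after %d checks\n", ok ? "yes" : "no", checks);
}

Running the sketch prints "joined: yes after 3 checks". The design choice in the commit is the same: treat acceptance and actual membership as two separate events, and bound the wait so a request that is accepted but never applied surfaces as an exception instead of hanging.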


@@ -430,6 +430,7 @@ def test_unregister_main():
     coordinator2_cursor = connect(host="localhost", port=7691).cursor()
     coordinator3_cursor = connect(host="localhost", port=7692).cursor()
 
+    assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 1 ON '127.0.0.1:10111'")
     assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 2 ON '127.0.0.1:10112'")
     execute_and_fetch_all(
         coordinator3_cursor, "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'"


@@ -292,5 +292,127 @@ def test_distributed_automatic_failover_after_coord_dies():
     mg_sleep_and_assert_collection(expected_data_on_new_main_old_alive, retrieve_data_show_replicas)
 
 
+def test_registering_4_coords():
+    # The goal of this test is to ensure that registering multiple coordinators in a row works
+    INSTANCES_DESCRIPTION = {
+        "instance_1": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7687",
+                "--log-level",
+                "TRACE",
+                "--coordinator-server-port",
+                "10011",
+            ],
+            "log_file": "instance_1.log",
+            "data_directory": f"{TEMP_DIR}/instance_1",
+            "setup_queries": [],
+        },
+        "instance_2": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7688",
+                "--log-level",
+                "TRACE",
+                "--coordinator-server-port",
+                "10012",
+            ],
+            "log_file": "instance_2.log",
+            "data_directory": f"{TEMP_DIR}/instance_2",
+            "setup_queries": [],
+        },
+        "instance_3": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7689",
+                "--log-level",
+                "TRACE",
+                "--coordinator-server-port",
+                "10013",
+            ],
+            "log_file": "instance_3.log",
+            "data_directory": f"{TEMP_DIR}/instance_3",
+            "setup_queries": [],
+        },
+        "coordinator_1": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7690",
+                "--log-level=TRACE",
+                "--raft-server-id=1",
+                "--raft-server-port=10111",
+            ],
+            "log_file": "coordinator1.log",
+            "setup_queries": [],
+        },
+        "coordinator_2": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7691",
+                "--log-level=TRACE",
+                "--raft-server-id=2",
+                "--raft-server-port=10112",
+            ],
+            "log_file": "coordinator2.log",
+            "setup_queries": [],
+        },
+        "coordinator_3": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7692",
+                "--log-level=TRACE",
+                "--raft-server-id=3",
+                "--raft-server-port=10113",
+            ],
+            "log_file": "coordinator3.log",
+            "setup_queries": [],
+        },
+        "coordinator_4": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7693",
+                "--log-level=TRACE",
+                "--raft-server-id=4",
+                "--raft-server-port=10114",
+            ],
+            "log_file": "coordinator4.log",
+            "setup_queries": [
+                "ADD COORDINATOR 1 ON '127.0.0.1:10111';",
+                "ADD COORDINATOR 2 ON '127.0.0.1:10112';",
+                "ADD COORDINATOR 3 ON '127.0.0.1:10113';",
+                "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'",
+                "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002'",
+                "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003'",
+                "SET INSTANCE instance_3 TO MAIN",
+            ],
+        },
+    }
+
+    interactive_mg_runner.start_all(INSTANCES_DESCRIPTION)
+
+    coord_cursor = connect(host="localhost", port=7693).cursor()
+
+    def retrieve_data_show_repl_cluster():
+        return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW INSTANCES;")))
+
+    expected_data_on_coord = [
+        ("coordinator_1", "127.0.0.1:10111", "", "unknown", "coordinator"),
+        ("coordinator_2", "127.0.0.1:10112", "", "unknown", "coordinator"),
+        ("coordinator_3", "127.0.0.1:10113", "", "unknown", "coordinator"),
+        ("coordinator_4", "127.0.0.1:10114", "", "unknown", "coordinator"),
+        ("instance_1", "", "127.0.0.1:10011", "up", "replica"),
+        ("instance_2", "", "127.0.0.1:10012", "up", "replica"),
+        ("instance_3", "", "127.0.0.1:10013", "up", "main"),
+    ]
+    mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__, "-rA"]))