Fix bug-prone add server to cluster behavior (#1792)
parent 6f849a14df
commit 02325f8673
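Before this change, AddCoordinatorInstance treated get_accepted() on the add_srv() result as success, i.e. it only verified that the membership-change request was accepted by the leader, not that the new coordinator actually joined the cluster. The patch below checks the command result code and then polls the Raft server configuration until the new server shows up, giving up after a bounded wait (10 tries x 200 ms). A minimal standalone sketch of that accept-then-poll pattern, with hypothetical request_accepted() / server_joined() helpers standing in for the NuRaft calls used in the real code:

#include <chrono>
#include <iostream>
#include <stdexcept>
#include <string>
#include <thread>

// Hypothetical stand-ins for the NuRaft calls in the real patch: the result-code
// check on raft_server_->add_srv(...) and the raft_server_->get_srv_config(...)
// lookup that tells the caller the server is now part of the cluster config.
bool request_accepted() { return true; }
bool server_joined(int attempt) { return attempt >= 3; }  // pretend the server shows up on the 3rd poll

void AddServerSketch(const std::string &endpoint) {
  if (!request_accepted()) {
    throw std::runtime_error("Failed to accept request to add server " + endpoint);
  }
  std::cout << "Request to add server " << endpoint << " accepted\n";

  // Bounded wait: poll until the new server is visible in the cluster
  // configuration, giving up after max_tries * waiting_period_ms milliseconds.
  constexpr int max_tries = 10;
  constexpr int waiting_period_ms = 200;
  bool added = false;
  for (int attempt = 1; attempt <= max_tries; ++attempt) {
    std::this_thread::sleep_for(std::chrono::milliseconds(waiting_period_ms));
    if (server_joined(attempt)) {
      added = true;
      break;
    }
  }

  if (!added) {
    throw std::runtime_error("Failed to add server " + endpoint + " to the cluster in " +
                             std::to_string(max_tries * waiting_period_ms) + "ms");
  }
}

int main() { AddServerSketch("127.0.0.1:10111"); }

In the actual patch the polling predicate is raft_server_->get_srv_config(raft_server_id), whose non-null result the loop treats as evidence that the server has joined.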
@@ -11,10 +11,11 @@
 
 #ifdef MG_ENTERPRISE
 
-#include "coordination/raft_state.hpp"
+#include <chrono>
 
 #include "coordination/coordinator_config.hpp"
 #include "coordination/coordinator_exceptions.hpp"
+#include "coordination/raft_state.hpp"
+#include "utils/counter.hpp"
 
 namespace memgraph::coordination {
@@ -110,10 +111,34 @@ auto RaftState::AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_po
     -> void {
   auto const endpoint = fmt::format("{}:{}", raft_address, raft_port);
   srv_config const srv_config_to_add(static_cast<int>(raft_server_id), endpoint);
-  if (!raft_server_->add_srv(srv_config_to_add)->get_accepted()) {
-    throw RaftAddServerException("Failed to add server {} to the cluster", endpoint);
+
+  auto cmd_result = raft_server_->add_srv(srv_config_to_add);
+
+  if (cmd_result->get_result_code() == nuraft::cmd_result_code::OK) {
+    spdlog::info("Request to add server {} to the cluster accepted", endpoint);
+  } else {
+    throw RaftAddServerException("Failed to accept request to add server {} to the cluster", endpoint);
+  }
+
+  // Waiting for server to join
+  constexpr int max_tries{10};
+  auto maybe_stop = utils::ResettableCounter<max_tries>();
+  constexpr int waiting_period{200};
+  bool added{false};
+  while (!maybe_stop()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(waiting_period));
+    const auto server_config = raft_server_->get_srv_config(static_cast<nuraft::int32>(raft_server_id));
+    if (server_config) {
+      spdlog::trace("Server with id {} added to cluster", raft_server_id);
+      added = true;
+      break;
+    }
   }
-  spdlog::info("Request to add server {} to the cluster accepted", endpoint);
+
+  if (!added) {
+    throw RaftAddServerException("Failed to add server {} to the cluster in {}ms", endpoint,
+                                 max_tries * waiting_period);
+  }
 }
 
 auto RaftState::GetAllCoordinators() const -> std::vector<ptr<srv_config>> {
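The wait loop above is bounded by utils::ResettableCounter<max_tries>(), which the code uses as a callable that keeps returning false until it has been invoked max_tries times; together with the 200 ms sleep per iteration this caps the wait at max_tries * waiting_period milliseconds, matching the exception message. A rough illustrative stand-in with that assumed behavior (not the actual memgraph utility):

#include <cassert>
#include <cstddef>

// Assumed behavior only: returns false for the first N calls and true
// afterwards, so `while (!counter())` runs its body at most N times.
template <std::size_t N>
class BoundedRetryCounter {
 public:
  bool operator()() {
    if (calls_ < N) {
      ++calls_;
      return false;
    }
    return true;
  }

 private:
  std::size_t calls_{0};
};

int main() {
  auto maybe_stop = BoundedRetryCounter<10>();
  int iterations = 0;
  while (!maybe_stop()) {
    ++iterations;  // in the patch: sleep 200 ms, then check whether get_srv_config(id) is non-null
  }
  assert(iterations == 10);
}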
@@ -430,6 +430,7 @@ def test_unregister_main():
     coordinator2_cursor = connect(host="localhost", port=7691).cursor()
     coordinator3_cursor = connect(host="localhost", port=7692).cursor()
     assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 1 ON '127.0.0.1:10111'")
+    assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 2 ON '127.0.0.1:10112'")
     execute_and_fetch_all(
         coordinator3_cursor, "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'"
@@ -292,5 +292,127 @@ def test_distributed_automatic_failover_after_coord_dies():
     mg_sleep_and_assert_collection(expected_data_on_new_main_old_alive, retrieve_data_show_replicas)
 
 
+def test_registering_4_coords():
+    # Goal of this test is to assure registering of multiple coordinators in row works
+    INSTANCES_DESCRIPTION = {
+        "instance_1": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7687",
+                "--log-level",
+                "TRACE",
+                "--coordinator-server-port",
+                "10011",
+            ],
+            "log_file": "instance_1.log",
+            "data_directory": f"{TEMP_DIR}/instance_1",
+            "setup_queries": [],
+        },
+        "instance_2": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7688",
+                "--log-level",
+                "TRACE",
+                "--coordinator-server-port",
+                "10012",
+            ],
+            "log_file": "instance_2.log",
+            "data_directory": f"{TEMP_DIR}/instance_2",
+            "setup_queries": [],
+        },
+        "instance_3": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7689",
+                "--log-level",
+                "TRACE",
+                "--coordinator-server-port",
+                "10013",
+            ],
+            "log_file": "instance_3.log",
+            "data_directory": f"{TEMP_DIR}/instance_3",
+            "setup_queries": [],
+        },
+        "coordinator_1": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7690",
+                "--log-level=TRACE",
+                "--raft-server-id=1",
+                "--raft-server-port=10111",
+            ],
+            "log_file": "coordinator1.log",
+            "setup_queries": [],
+        },
+        "coordinator_2": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7691",
+                "--log-level=TRACE",
+                "--raft-server-id=2",
+                "--raft-server-port=10112",
+            ],
+            "log_file": "coordinator2.log",
+            "setup_queries": [],
+        },
+        "coordinator_3": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7692",
+                "--log-level=TRACE",
+                "--raft-server-id=3",
+                "--raft-server-port=10113",
+            ],
+            "log_file": "coordinator3.log",
+            "setup_queries": [],
+        },
+        "coordinator_4": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7693",
+                "--log-level=TRACE",
+                "--raft-server-id=4",
+                "--raft-server-port=10114",
+            ],
+            "log_file": "coordinator4.log",
+            "setup_queries": [
+                "ADD COORDINATOR 1 ON '127.0.0.1:10111';",
+                "ADD COORDINATOR 2 ON '127.0.0.1:10112';",
+                "ADD COORDINATOR 3 ON '127.0.0.1:10113';",
+                "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'",
+                "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002'",
+                "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003'",
+                "SET INSTANCE instance_3 TO MAIN",
+            ],
+        },
+    }
+
+    interactive_mg_runner.start_all(INSTANCES_DESCRIPTION)
+
+    coord_cursor = connect(host="localhost", port=7693).cursor()
+
+    def retrieve_data_show_repl_cluster():
+        return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW INSTANCES;")))
+
+    expected_data_on_coord = [
+        ("coordinator_1", "127.0.0.1:10111", "", "unknown", "coordinator"),
+        ("coordinator_2", "127.0.0.1:10112", "", "unknown", "coordinator"),
+        ("coordinator_3", "127.0.0.1:10113", "", "unknown", "coordinator"),
+        ("coordinator_4", "127.0.0.1:10114", "", "unknown", "coordinator"),
+        ("instance_1", "", "127.0.0.1:10011", "up", "replica"),
+        ("instance_2", "", "127.0.0.1:10012", "up", "replica"),
+        ("instance_3", "", "127.0.0.1:10013", "up", "main"),
+    ]
+    mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__, "-rA"]))