Fix bug prone add server to cluster behavior (#1792)
This commit is contained in:
parent
6f849a14df
commit
02325f8673
@ -11,10 +11,11 @@
|
|||||||
|
|
||||||
#ifdef MG_ENTERPRISE
|
#ifdef MG_ENTERPRISE
|
||||||
|
|
||||||
#include "coordination/raft_state.hpp"
|
#include <chrono>
|
||||||
|
|
||||||
#include "coordination/coordinator_config.hpp"
|
#include "coordination/coordinator_config.hpp"
|
||||||
#include "coordination/coordinator_exceptions.hpp"
|
#include "coordination/coordinator_exceptions.hpp"
|
||||||
|
#include "coordination/raft_state.hpp"
|
||||||
#include "utils/counter.hpp"
|
#include "utils/counter.hpp"
|
||||||
|
|
||||||
namespace memgraph::coordination {
|
namespace memgraph::coordination {
|
||||||
@ -110,10 +111,34 @@ auto RaftState::AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_po
|
|||||||
-> void {
|
-> void {
|
||||||
auto const endpoint = fmt::format("{}:{}", raft_address, raft_port);
|
auto const endpoint = fmt::format("{}:{}", raft_address, raft_port);
|
||||||
srv_config const srv_config_to_add(static_cast<int>(raft_server_id), endpoint);
|
srv_config const srv_config_to_add(static_cast<int>(raft_server_id), endpoint);
|
||||||
if (!raft_server_->add_srv(srv_config_to_add)->get_accepted()) {
|
|
||||||
throw RaftAddServerException("Failed to add server {} to the cluster", endpoint);
|
auto cmd_result = raft_server_->add_srv(srv_config_to_add);
|
||||||
|
|
||||||
|
if (cmd_result->get_result_code() == nuraft::cmd_result_code::OK) {
|
||||||
|
spdlog::info("Request to add server {} to the cluster accepted", endpoint);
|
||||||
|
} else {
|
||||||
|
throw RaftAddServerException("Failed to accept request to add server {} to the cluster", endpoint);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Waiting for server to join
|
||||||
|
constexpr int max_tries{10};
|
||||||
|
auto maybe_stop = utils::ResettableCounter<max_tries>();
|
||||||
|
constexpr int waiting_period{200};
|
||||||
|
bool added{false};
|
||||||
|
while (!maybe_stop()) {
|
||||||
|
std::this_thread::sleep_for(std::chrono::milliseconds(waiting_period));
|
||||||
|
const auto server_config = raft_server_->get_srv_config(static_cast<nuraft::int32>(raft_server_id));
|
||||||
|
if (server_config) {
|
||||||
|
spdlog::trace("Server with id {} added to cluster", raft_server_id);
|
||||||
|
added = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!added) {
|
||||||
|
throw RaftAddServerException("Failed to add server {} to the cluster in {}ms", endpoint,
|
||||||
|
max_tries * waiting_period);
|
||||||
}
|
}
|
||||||
spdlog::info("Request to add server {} to the cluster accepted", endpoint);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
auto RaftState::GetAllCoordinators() const -> std::vector<ptr<srv_config>> {
|
auto RaftState::GetAllCoordinators() const -> std::vector<ptr<srv_config>> {
|
||||||
|
@ -430,6 +430,7 @@ def test_unregister_main():
|
|||||||
coordinator2_cursor = connect(host="localhost", port=7691).cursor()
|
coordinator2_cursor = connect(host="localhost", port=7691).cursor()
|
||||||
coordinator3_cursor = connect(host="localhost", port=7692).cursor()
|
coordinator3_cursor = connect(host="localhost", port=7692).cursor()
|
||||||
assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 1 ON '127.0.0.1:10111'")
|
assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 1 ON '127.0.0.1:10111'")
|
||||||
|
|
||||||
assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 2 ON '127.0.0.1:10112'")
|
assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 2 ON '127.0.0.1:10112'")
|
||||||
execute_and_fetch_all(
|
execute_and_fetch_all(
|
||||||
coordinator3_cursor, "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'"
|
coordinator3_cursor, "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'"
|
||||||
|
@ -292,5 +292,127 @@ def test_distributed_automatic_failover_after_coord_dies():
|
|||||||
mg_sleep_and_assert_collection(expected_data_on_new_main_old_alive, retrieve_data_show_replicas)
|
mg_sleep_and_assert_collection(expected_data_on_new_main_old_alive, retrieve_data_show_replicas)
|
||||||
|
|
||||||
|
|
||||||
|
def test_registering_4_coords():
|
||||||
|
# Goal of this test is to assure registering of multiple coordinators in row works
|
||||||
|
INSTANCES_DESCRIPTION = {
|
||||||
|
"instance_1": {
|
||||||
|
"args": [
|
||||||
|
"--experimental-enabled=high-availability",
|
||||||
|
"--bolt-port",
|
||||||
|
"7687",
|
||||||
|
"--log-level",
|
||||||
|
"TRACE",
|
||||||
|
"--coordinator-server-port",
|
||||||
|
"10011",
|
||||||
|
],
|
||||||
|
"log_file": "instance_1.log",
|
||||||
|
"data_directory": f"{TEMP_DIR}/instance_1",
|
||||||
|
"setup_queries": [],
|
||||||
|
},
|
||||||
|
"instance_2": {
|
||||||
|
"args": [
|
||||||
|
"--experimental-enabled=high-availability",
|
||||||
|
"--bolt-port",
|
||||||
|
"7688",
|
||||||
|
"--log-level",
|
||||||
|
"TRACE",
|
||||||
|
"--coordinator-server-port",
|
||||||
|
"10012",
|
||||||
|
],
|
||||||
|
"log_file": "instance_2.log",
|
||||||
|
"data_directory": f"{TEMP_DIR}/instance_2",
|
||||||
|
"setup_queries": [],
|
||||||
|
},
|
||||||
|
"instance_3": {
|
||||||
|
"args": [
|
||||||
|
"--experimental-enabled=high-availability",
|
||||||
|
"--bolt-port",
|
||||||
|
"7689",
|
||||||
|
"--log-level",
|
||||||
|
"TRACE",
|
||||||
|
"--coordinator-server-port",
|
||||||
|
"10013",
|
||||||
|
],
|
||||||
|
"log_file": "instance_3.log",
|
||||||
|
"data_directory": f"{TEMP_DIR}/instance_3",
|
||||||
|
"setup_queries": [],
|
||||||
|
},
|
||||||
|
"coordinator_1": {
|
||||||
|
"args": [
|
||||||
|
"--experimental-enabled=high-availability",
|
||||||
|
"--bolt-port",
|
||||||
|
"7690",
|
||||||
|
"--log-level=TRACE",
|
||||||
|
"--raft-server-id=1",
|
||||||
|
"--raft-server-port=10111",
|
||||||
|
],
|
||||||
|
"log_file": "coordinator1.log",
|
||||||
|
"setup_queries": [],
|
||||||
|
},
|
||||||
|
"coordinator_2": {
|
||||||
|
"args": [
|
||||||
|
"--experimental-enabled=high-availability",
|
||||||
|
"--bolt-port",
|
||||||
|
"7691",
|
||||||
|
"--log-level=TRACE",
|
||||||
|
"--raft-server-id=2",
|
||||||
|
"--raft-server-port=10112",
|
||||||
|
],
|
||||||
|
"log_file": "coordinator2.log",
|
||||||
|
"setup_queries": [],
|
||||||
|
},
|
||||||
|
"coordinator_3": {
|
||||||
|
"args": [
|
||||||
|
"--experimental-enabled=high-availability",
|
||||||
|
"--bolt-port",
|
||||||
|
"7692",
|
||||||
|
"--log-level=TRACE",
|
||||||
|
"--raft-server-id=3",
|
||||||
|
"--raft-server-port=10113",
|
||||||
|
],
|
||||||
|
"log_file": "coordinator3.log",
|
||||||
|
"setup_queries": [],
|
||||||
|
},
|
||||||
|
"coordinator_4": {
|
||||||
|
"args": [
|
||||||
|
"--experimental-enabled=high-availability",
|
||||||
|
"--bolt-port",
|
||||||
|
"7693",
|
||||||
|
"--log-level=TRACE",
|
||||||
|
"--raft-server-id=4",
|
||||||
|
"--raft-server-port=10114",
|
||||||
|
],
|
||||||
|
"log_file": "coordinator4.log",
|
||||||
|
"setup_queries": [
|
||||||
|
"ADD COORDINATOR 1 ON '127.0.0.1:10111';",
|
||||||
|
"ADD COORDINATOR 2 ON '127.0.0.1:10112';",
|
||||||
|
"ADD COORDINATOR 3 ON '127.0.0.1:10113';",
|
||||||
|
"REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'",
|
||||||
|
"REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002'",
|
||||||
|
"REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003'",
|
||||||
|
"SET INSTANCE instance_3 TO MAIN",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
interactive_mg_runner.start_all(INSTANCES_DESCRIPTION)
|
||||||
|
|
||||||
|
coord_cursor = connect(host="localhost", port=7693).cursor()
|
||||||
|
|
||||||
|
def retrieve_data_show_repl_cluster():
|
||||||
|
return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW INSTANCES;")))
|
||||||
|
|
||||||
|
expected_data_on_coord = [
|
||||||
|
("coordinator_1", "127.0.0.1:10111", "", "unknown", "coordinator"),
|
||||||
|
("coordinator_2", "127.0.0.1:10112", "", "unknown", "coordinator"),
|
||||||
|
("coordinator_3", "127.0.0.1:10113", "", "unknown", "coordinator"),
|
||||||
|
("coordinator_4", "127.0.0.1:10114", "", "unknown", "coordinator"),
|
||||||
|
("instance_1", "", "127.0.0.1:10011", "up", "replica"),
|
||||||
|
("instance_2", "", "127.0.0.1:10012", "up", "replica"),
|
||||||
|
("instance_3", "", "127.0.0.1:10013", "up", "main"),
|
||||||
|
]
|
||||||
|
mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
sys.exit(pytest.main([__file__, "-rA"]))
|
sys.exit(pytest.main([__file__, "-rA"]))
|
||||||
|
Loading…
Reference in New Issue
Block a user