Fix bug-prone add-server-to-cluster behavior (#1792)

Antonio Filipovic 2024-03-07 12:10:33 +01:00 committed by GitHub
parent 6f849a14df
commit 02325f8673
3 changed files with 152 additions and 4 deletions


@@ -11,10 +11,11 @@
 #ifdef MG_ENTERPRISE
 
-#include "coordination/raft_state.hpp"
+#include <chrono>
 
 #include "coordination/coordinator_config.hpp"
 #include "coordination/coordinator_exceptions.hpp"
+#include "coordination/raft_state.hpp"
 #include "utils/counter.hpp"
 
 namespace memgraph::coordination {
 
@@ -110,10 +111,34 @@ auto RaftState::AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_po
     -> void {
   auto const endpoint = fmt::format("{}:{}", raft_address, raft_port);
   srv_config const srv_config_to_add(static_cast<int>(raft_server_id), endpoint);
-  if (!raft_server_->add_srv(srv_config_to_add)->get_accepted()) {
-    throw RaftAddServerException("Failed to add server {} to the cluster", endpoint);
+
+  auto cmd_result = raft_server_->add_srv(srv_config_to_add);
+
+  if (cmd_result->get_result_code() == nuraft::cmd_result_code::OK) {
+    spdlog::info("Request to add server {} to the cluster accepted", endpoint);
+  } else {
+    throw RaftAddServerException("Failed to accept request to add server {} to the cluster", endpoint);
   }
+
+  // Waiting for the server to join the cluster
+  constexpr int max_tries{10};
+  auto maybe_stop = utils::ResettableCounter<max_tries>();
+  constexpr int waiting_period{200};
+  bool added{false};
+  while (!maybe_stop()) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(waiting_period));
+    const auto server_config = raft_server_->get_srv_config(static_cast<nuraft::int32>(raft_server_id));
+    if (server_config) {
+      spdlog::trace("Server with id {} added to cluster", raft_server_id);
+      added = true;
+      break;
+    }
+  }
+
+  if (!added) {
+    throw RaftAddServerException("Failed to add server {} to the cluster in {}ms", endpoint,
+                                 max_tries * waiting_period);
+  }
-  spdlog::info("Request to add server {} to the cluster accepted", endpoint);
 }
 
 auto RaftState::GetAllCoordinators() const -> std::vector<ptr<srv_config>> {
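
The shape of the fix: a result code of nuraft::cmd_result_code::OK from add_srv() only means the leader accepted the membership request, so the commit then polls get_srv_config() until the new server actually appears in the cluster configuration, giving up after max_tries * waiting_period = 10 * 200 ms = 2000 ms (the value reported in the exception message). Below is a minimal, NuRaft-free sketch of that bounded-poll pattern. MakeCounter and PollUntil are hypothetical names for illustration, and the assumed semantics of utils::ResettableCounter (a callable that starts returning true after N invocations) are inferred from its use in the diff, not shown here.

#include <chrono>
#include <cstdio>
#include <functional>
#include <thread>

// Hypothetical stand-in for utils::ResettableCounter: a callable that
// returns true once it has been invoked more than MaxTries times.
template <int MaxTries>
auto MakeCounter() {
  return [count = 0]() mutable { return ++count > MaxTries; };
}

// Generic bounded poll: sleep, re-check a predicate, and give up after
// roughly MaxTries * period_ms milliseconds. Mirrors the loop added above.
template <int MaxTries>
bool PollUntil(const std::function<bool()> &joined, int period_ms) {
  auto stop = MakeCounter<MaxTries>();
  while (!stop()) {
    std::this_thread::sleep_for(std::chrono::milliseconds(period_ms));
    // In the real code the predicate would be
    // raft_server_->get_srv_config(raft_server_id) != nullptr.
    if (joined()) return true;
  }
  return false;
}

int main() {
  int checks = 0;
  // Pretend the server shows up in the cluster config on the third check.
  bool ok = PollUntil<10>([&checks] { return ++checks >= 3; }, /*period_ms=*/10);
  std::printf("joined: %s after %d checks\n", ok ? "yes" : "no", checks);
}

Running the sketch prints "joined: yes after 3 checks". The design choice in the commit is the same: treat acceptance and actual membership as two separate events, and bound the wait so a request that is accepted but never applied surfaces as an exception instead of hanging.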


@@ -430,6 +430,7 @@ def test_unregister_main():
     coordinator2_cursor = connect(host="localhost", port=7691).cursor()
     coordinator3_cursor = connect(host="localhost", port=7692).cursor()
 
+    assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 1 ON '127.0.0.1:10111'")
     assert add_coordinator(coordinator3_cursor, "ADD COORDINATOR 2 ON '127.0.0.1:10112'")
     execute_and_fetch_all(
         coordinator3_cursor, "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'"


@@ -292,5 +292,127 @@ def test_distributed_automatic_failover_after_coord_dies():
     mg_sleep_and_assert_collection(expected_data_on_new_main_old_alive, retrieve_data_show_replicas)
 
 
+def test_registering_4_coords():
+    # The goal of this test is to ensure that registering multiple coordinators in a row works
+    INSTANCES_DESCRIPTION = {
+        "instance_1": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7687",
+                "--log-level",
+                "TRACE",
+                "--coordinator-server-port",
+                "10011",
+            ],
+            "log_file": "instance_1.log",
+            "data_directory": f"{TEMP_DIR}/instance_1",
+            "setup_queries": [],
+        },
+        "instance_2": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7688",
+                "--log-level",
+                "TRACE",
+                "--coordinator-server-port",
+                "10012",
+            ],
+            "log_file": "instance_2.log",
+            "data_directory": f"{TEMP_DIR}/instance_2",
+            "setup_queries": [],
+        },
+        "instance_3": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7689",
+                "--log-level",
+                "TRACE",
+                "--coordinator-server-port",
+                "10013",
+            ],
+            "log_file": "instance_3.log",
+            "data_directory": f"{TEMP_DIR}/instance_3",
+            "setup_queries": [],
+        },
+        "coordinator_1": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7690",
+                "--log-level=TRACE",
+                "--raft-server-id=1",
+                "--raft-server-port=10111",
+            ],
+            "log_file": "coordinator1.log",
+            "setup_queries": [],
+        },
+        "coordinator_2": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7691",
+                "--log-level=TRACE",
+                "--raft-server-id=2",
+                "--raft-server-port=10112",
+            ],
+            "log_file": "coordinator2.log",
+            "setup_queries": [],
+        },
+        "coordinator_3": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7692",
+                "--log-level=TRACE",
+                "--raft-server-id=3",
+                "--raft-server-port=10113",
+            ],
+            "log_file": "coordinator3.log",
+            "setup_queries": [],
+        },
+        "coordinator_4": {
+            "args": [
+                "--experimental-enabled=high-availability",
+                "--bolt-port",
+                "7693",
+                "--log-level=TRACE",
+                "--raft-server-id=4",
+                "--raft-server-port=10114",
+            ],
+            "log_file": "coordinator4.log",
+            "setup_queries": [
+                "ADD COORDINATOR 1 ON '127.0.0.1:10111';",
+                "ADD COORDINATOR 2 ON '127.0.0.1:10112';",
+                "ADD COORDINATOR 3 ON '127.0.0.1:10113';",
+                "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001'",
+                "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002'",
+                "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003'",
+                "SET INSTANCE instance_3 TO MAIN",
+            ],
+        },
+    }
+
+    interactive_mg_runner.start_all(INSTANCES_DESCRIPTION)
+
+    coord_cursor = connect(host="localhost", port=7693).cursor()
+
+    def retrieve_data_show_repl_cluster():
+        return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW INSTANCES;")))
+
+    expected_data_on_coord = [
+        ("coordinator_1", "127.0.0.1:10111", "", "unknown", "coordinator"),
+        ("coordinator_2", "127.0.0.1:10112", "", "unknown", "coordinator"),
+        ("coordinator_3", "127.0.0.1:10113", "", "unknown", "coordinator"),
+        ("coordinator_4", "127.0.0.1:10114", "", "unknown", "coordinator"),
+        ("instance_1", "", "127.0.0.1:10011", "up", "replica"),
+        ("instance_2", "", "127.0.0.1:10012", "up", "replica"),
+        ("instance_3", "", "127.0.0.1:10013", "up", "main"),
+    ]
+    mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__, "-rA"]))