From 303608982ad286b9b180975661f79de7945e920b Mon Sep 17 00:00:00 2001
From: Andi Skrgat <andi8647@gmail.com>
Date: Thu, 25 Jan 2024 09:04:05 +0100
Subject: [PATCH] Remove cluster_initialized and client_initiated failover tests

---
 src/coordination/coordinator_data.cpp         |  18 +---
 src/coordination/coordinator_state.cpp        |   9 +-
 .../include/coordination/failover_status.hpp  |   2 +-
 src/dbms/coordinator_handlers.cpp             |  25 +----
 src/query/interpreter.cpp                     |  20 ++++
 .../v2/replication/replication_client.cpp     |   1 +
 .../CMakeLists.txt                            |   3 +-
 .../automatic_failover.py                     |   2 +-
 .../coordinator.py                            |  11 --
 ...ailover.py => show_replication_cluster.py} | 102 ------------------
 .../uninitialized_cluster.py                  |  26 -----
 .../workloads.yaml                            |  31 +-----
 12 files changed, 36 insertions(+), 214 deletions(-)
 rename tests/e2e/high_availability_experimental/{client_initiated_failover.py => show_replication_cluster.py} (53%)
 delete mode 100644 tests/e2e/high_availability_experimental/uninitialized_cluster.py

diff --git a/src/coordination/coordinator_data.cpp b/src/coordination/coordinator_data.cpp
index ce783e749..9c4b9fea6 100644
--- a/src/coordination/coordinator_data.cpp
+++ b/src/coordination/coordinator_data.cpp
@@ -68,9 +68,6 @@ CoordinatorData::CoordinatorData() {
       case MAIN_ALIVE:
         spdlog::warn("Failover aborted since main is alive!");
         break;
-      case CLUSTER_UNINITIALIZED:
-        spdlog::warn("Failover aborted since cluster is uninitialized!");
-        break;
       case RPC_FAILED:
         spdlog::warn("Failover aborted since promoting replica to main failed!");
         break;
@@ -85,18 +82,6 @@ auto CoordinatorData::DoFailover() -> DoFailoverStatus {
   using ReplicationClientInfo = CoordinatorClientConfig::ReplicationClientInfo;
   std::lock_guard<utils::RWLock> lock{coord_data_lock_};
 
-  const auto main_instance = std::ranges::find_if(registered_instances_, &CoordinatorInstance::IsMain);
-
-  if (main_instance == registered_instances_.end()) {
-    return DoFailoverStatus::CLUSTER_UNINITIALIZED;
-  }
-
-  if (main_instance->IsAlive()) {
-    return DoFailoverStatus::MAIN_ALIVE;
-  }
-
-  main_instance->client_.PauseFrequentCheck();
-
   auto replica_instances = registered_instances_ | ranges::views::filter(&CoordinatorInstance::IsReplica);
 
   auto chosen_replica_instance = std::ranges::find_if(replica_instances, &CoordinatorInstance::IsAlive);
@@ -114,8 +99,8 @@ auto CoordinatorData::DoFailover() -> DoFailoverStatus {
   };
   auto not_main = [](const CoordinatorInstance &instance) { return !instance.IsMain(); };
 
-  // Filter not current replicas and not MAIN instance
   // TODO (antoniofilipovic): Should we send also data on old MAIN???
+  // TODO: (andi) Don't send replicas which aren't alive
   for (const auto &unchosen_replica_instance :
        replica_instances | ranges::views::filter(not_chosen_replica_instance) | ranges::views::filter(not_main)) {
     repl_clients_info.emplace_back(unchosen_replica_instance.client_.ReplicationClientInfo());
@@ -127,7 +112,6 @@ auto CoordinatorData::DoFailover() -> DoFailoverStatus {
   }
 
   chosen_replica_instance->PostFailover(main_succ_cb_, main_fail_cb_);
-  main_instance->replication_role_ = replication_coordination_glue::ReplicationRole::REPLICA;
 
   return DoFailoverStatus::SUCCESS;
 }
diff --git a/src/coordination/coordinator_state.cpp b/src/coordination/coordinator_state.cpp
index 0b0b87b50..154a5d0dd 100644
--- a/src/coordination/coordinator_state.cpp
+++ b/src/coordination/coordinator_state.cpp
@@ -9,18 +9,19 @@
 // by the Apache License, Version 2.0, included in the file
 // licenses/APL.txt.
 
-#include "coordination/coordinator_state.hpp"
-#include <algorithm>
-#include "coordination/register_main_replica_coordinator_status.hpp"
-
 #ifdef MG_ENTERPRISE
 
+#include "coordination/coordinator_state.hpp"
+
+#include "coordination/register_main_replica_coordinator_status.hpp"
 #include "coordination/coordinator_config.hpp"
 #include "flags/replication.hpp"
 #include "spdlog/spdlog.h"
 #include "utils/logging.hpp"
 #include "utils/variant_helpers.hpp"
+#include <algorithm>
+
 namespace memgraph::coordination {
 
 CoordinatorState::CoordinatorState() {
diff --git a/src/coordination/include/coordination/failover_status.hpp b/src/coordination/include/coordination/failover_status.hpp
index d13909b5e..9cfa0ffe6 100644
--- a/src/coordination/include/coordination/failover_status.hpp
+++ b/src/coordination/include/coordination/failover_status.hpp
@@ -16,6 +16,6 @@
 #include <cstdint>
 
 namespace memgraph::coordination {
-enum class DoFailoverStatus : uint8_t { SUCCESS, ALL_REPLICAS_DOWN, MAIN_ALIVE, CLUSTER_UNINITIALIZED, RPC_FAILED };
+enum class DoFailoverStatus : uint8_t { SUCCESS, ALL_REPLICAS_DOWN, MAIN_ALIVE, RPC_FAILED };
 } // namespace memgraph::coordination
 #endif
diff --git a/src/dbms/coordinator_handlers.cpp b/src/dbms/coordinator_handlers.cpp
index 3ffc62d37..cad23c0aa 100644
--- a/src/dbms/coordinator_handlers.cpp
+++ b/src/dbms/coordinator_handlers.cpp
@@ -130,30 +130,13 @@ void CoordinatorHandlers::PromoteReplicaToMainHandler(DbmsHandler &dbms_handler,
     }
     auto &instance_client_ref = *instance_client.GetValue();
 
+    // TODO: (andi) Policy for register all databases
+    // Will be resolved after deciding about choosing new replica
     const bool all_clients_good = memgraph::dbms::RegisterAllDatabasesClients(dbms_handler, instance_client_ref);
-
-    if (!all_clients_good) {
-      spdlog::error(
-          "Failed to register one or more databases to the REPLICA \"{}\". Failover aborted, instance will continue to "
-          "operate as replica.",
-          config.name);
-
-      dbms_handler.ForEach([&](DatabaseAccess db_acc) {
-        auto *storage = db_acc->storage();
-        storage->repl_storage_state_.replication_clients_.WithLock([](auto &clients) { clients.clear(); });
-      });
-
-      std::get<replication::RoleMainData>(dbms_handler.ReplicationState().ReplicationData())
-          .registered_replicas_.clear();
-
-      repl_state.SetReplicationRoleReplica(repl_server_config);
-
-      slk::Save(coordination::PromoteReplicaToMainRes{false}, res_builder);
-      return;
-    }
+    MG_ASSERT(all_clients_good, "Failed to register one or more databases to the REPLICA \"{}\".", config.name);
 
     StartReplicaClient(dbms_handler, instance_client_ref);
-  };
+  }
 
   slk::Save(coordination::PromoteReplicaToMainRes{true}, res_builder);
 }
diff --git a/src/query/interpreter.cpp b/src/query/interpreter.cpp
index 301a0b592..1601e8155 100644
--- a/src/query/interpreter.cpp
+++ b/src/query/interpreter.cpp
@@ -528,6 +528,26 @@ class CoordQueryHandler final : public query::CoordinatorQueryHandler {
     }
   }
 
+  /// @throw QueryRuntimeException if an error occurred.
+  void DoFailover() const override {
+    if (!FLAGS_coordinator) {
+      throw QueryRuntimeException("Only coordinator can run DO FAILOVER!");
+    }
+
+    auto status = coordinator_handler_.DoFailover();
+    switch (status) {
+      using enum memgraph::coordination::DoFailoverStatus;
+      case ALL_REPLICAS_DOWN:
+        throw QueryRuntimeException("Failover aborted since all replicas are down!");
+      case MAIN_ALIVE:
+        throw QueryRuntimeException("Failover aborted since main is alive!");
+      case RPC_FAILED:
+        throw QueryRuntimeException("Failover aborted since promoting replica to main failed!");
+      case SUCCESS:
+        break;
+    }
+  }
+
 #endif
 
 #ifdef MG_ENTERPRISE
diff --git a/src/storage/v2/replication/replication_client.cpp b/src/storage/v2/replication/replication_client.cpp
index 5b29e8b15..b68618e04 100644
--- a/src/storage/v2/replication/replication_client.cpp
+++ b/src/storage/v2/replication/replication_client.cpp
@@ -67,6 +67,7 @@ void ReplicationStorageClient::UpdateReplicaState(Storage *storage, DatabaseAcce
         "now hold unique data. Please resolve data conflicts and start the "
         "replication on a clean instance.",
         client_.name_, client_.name_, client_.name_);
+    // TODO: (andi) Talk about renaming MAYBE_BEHIND to branching
     // State not updated, hence in MAYBE_BEHIND state
     return;
   }
diff --git a/tests/e2e/high_availability_experimental/CMakeLists.txt b/tests/e2e/high_availability_experimental/CMakeLists.txt
index be7ff294f..a1f0f742c 100644
--- a/tests/e2e/high_availability_experimental/CMakeLists.txt
+++ b/tests/e2e/high_availability_experimental/CMakeLists.txt
@@ -1,9 +1,8 @@
 find_package(gflags REQUIRED)
 
 copy_e2e_python_files(ha_experimental coordinator.py)
-copy_e2e_python_files(ha_experimental client_initiated_failover.py)
+copy_e2e_python_files(ha_experimental show_replication_cluster.py)
 copy_e2e_python_files(ha_experimental automatic_failover.py)
-copy_e2e_python_files(ha_experimental uninitialized_cluster.py)
 copy_e2e_python_files(ha_experimental common.py)
 copy_e2e_python_files(ha_experimental conftest.py)
 copy_e2e_python_files(ha_experimental workloads.yaml)
diff --git a/tests/e2e/high_availability_experimental/automatic_failover.py b/tests/e2e/high_availability_experimental/automatic_failover.py
index 46254ed21..07eebd4bf 100644
--- a/tests/e2e/high_availability_experimental/automatic_failover.py
+++ b/tests/e2e/high_availability_experimental/automatic_failover.py
@@ -76,7 +76,7 @@ def test_simple_automatic_failover(connection):
     expected_data_on_coord = [
         ("instance_1", "127.0.0.1:10011", True, "main"),
         ("instance_2", "127.0.0.1:10012", True, "replica"),
-        ("instance_3", "127.0.0.1:10013", False, "replica"),
+        ("instance_3", "127.0.0.1:10013", False, "main"),
     ]
     mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
 
diff --git a/tests/e2e/high_availability_experimental/coordinator.py b/tests/e2e/high_availability_experimental/coordinator.py
index 711ccb8d0..2c16813a5 100644
--- a/tests/e2e/high_availability_experimental/coordinator.py
+++ b/tests/e2e/high_availability_experimental/coordinator.py
@@ -83,16 +83,5 @@ def test_main_and_replicas_cannot_register_coord_server(port, role, connection):
     assert str(e.value) == "Only coordinator can register coordinator server!"
 
 
-@pytest.mark.parametrize(
-    "port, role",
-    [(7687, "main"), (7688, "replica"), (7689, "replica")],
-)
-def test_main_and_replicas_cannot_run_do_failover(port, role, connection):
-    cursor = connection(port, role).cursor()
-    with pytest.raises(Exception) as e:
-        execute_and_fetch_all(cursor, "DO FAILOVER;")
-    assert str(e.value) == "Only coordinator can run DO FAILOVER!"
-
-
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__, "-rA"]))
diff --git a/tests/e2e/high_availability_experimental/client_initiated_failover.py b/tests/e2e/high_availability_experimental/show_replication_cluster.py
similarity index 53%
rename from tests/e2e/high_availability_experimental/client_initiated_failover.py
rename to tests/e2e/high_availability_experimental/show_replication_cluster.py
index 96b333534..aaa4d4414 100644
--- a/tests/e2e/high_availability_experimental/client_initiated_failover.py
+++ b/tests/e2e/high_availability_experimental/show_replication_cluster.py
@@ -101,107 +101,5 @@ def test_show_replication_cluster(connection):
     mg_sleep_and_assert(expected_data, retrieve_data)
 
 
-def test_simple_client_initiated_failover(connection):
-    # 1. Start all instances
-    # 2. Kill main
-    # 3. Run DO FAILOVER on COORDINATOR
-    # 4. Assert new config on coordinator by running show replication cluster
-    # 5. Assert replicas on new main
-
-    # 1.
-    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
-
-    # 2.
-    main_cursor = connection(7687, "instance_3").cursor()
-    expected_data_on_main = [
-        ("instance_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
-        ("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
-    ]
-    actual_data_on_main = sorted(list(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;")))
-    assert actual_data_on_main == expected_data_on_main
-
-    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")
-    coord_cursor = connection(7690, "coordinator").cursor()
-
-    def retrieve_data_show_repl_cluster():
-        return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW REPLICATION CLUSTER;")))
-
-    expected_data_on_coord = [
-        ("instance_1", "127.0.0.1:10011", True, "replica"),
-        ("instance_2", "127.0.0.1:10012", True, "replica"),
-        ("instance_3", "127.0.0.1:10013", False, "main"),
-    ]
-    mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
-
-    # 3.
-    execute_and_fetch_all(coord_cursor, "DO FAILOVER")
-
-    expected_data_on_coord = [
-        ("instance_1", "127.0.0.1:10011", True, "main"),
-        ("instance_2", "127.0.0.1:10012", True, "replica"),
-    ]
-    mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)
-
-    # 4.
-    new_main_cursor = connection(7688, "instance_1").cursor()
-
-    def retrieve_data_show_replicas():
-        return sorted(list(execute_and_fetch_all(new_main_cursor, "SHOW REPLICAS;")))
-
-    expected_data_on_new_main = [
-        ("instance_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
-    ]
-    mg_sleep_and_assert(expected_data_on_new_main, retrieve_data_show_replicas)
-
-
-def test_failover_fails_all_replicas_down(connection):
-    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
-
-    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_1")
-    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_2")
-    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "instance_3")
-
-    coord_cursor = connection(7690, "coordinator").cursor()
-
-    def retrieve_data():
-        return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW REPLICATION CLUSTER;")))
-
-    expected_data_on_coord = [
-        ("instance_1", "127.0.0.1:10011", False, "replica"),
-        ("instance_2", "127.0.0.1:10012", False, "replica"),
-        ("instance_3", "127.0.0.1:10013", False, "main"),
-    ]
-    mg_sleep_and_assert(expected_data_on_coord, retrieve_data)
-
-    # 4.
-    with pytest.raises(Exception) as e:
-        execute_and_fetch_all(coord_cursor, "DO FAILOVER;")
-    assert str(e.value) == "Failover aborted since all replicas are down!"
-
-    mg_sleep_and_assert(expected_data_on_coord, retrieve_data)
-
-
-def test_failover_fails_main_is_alive(connection):
-    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
-
-    coord_cursor = connection(7690, "coordinator").cursor()
-
-    def retrieve_data():
-        return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW REPLICATION CLUSTER;")))
-
-    expected_data_on_coord = [
-        ("instance_1", "127.0.0.1:10011", True, "replica"),
-        ("instance_2", "127.0.0.1:10012", True, "replica"),
-        ("instance_3", "127.0.0.1:10013", True, "main"),
-    ]
-    mg_sleep_and_assert(expected_data_on_coord, retrieve_data)
-
-    with pytest.raises(Exception) as e:
-        execute_and_fetch_all(coord_cursor, "DO FAILOVER;")
-    assert str(e.value) == "Failover aborted since main is alive!"
-
-    mg_sleep_and_assert(expected_data_on_coord, retrieve_data)
-
-
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__, "-rA"]))
diff --git a/tests/e2e/high_availability_experimental/uninitialized_cluster.py b/tests/e2e/high_availability_experimental/uninitialized_cluster.py
deleted file mode 100644
index 6c0f77bb7..000000000
--- a/tests/e2e/high_availability_experimental/uninitialized_cluster.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2022 Memgraph Ltd.
-#
-# Use of this software is governed by the Business Source License
-# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
-# License, and you may not use this file except in compliance with the Business Source License.
-#
-# As of the Change Date specified in that file, in accordance with
-# the Business Source License, use of this software will be governed
-# by the Apache License, Version 2.0, included in the file
-# licenses/APL.txt.
-
-import sys
-
-import pytest
-from common import execute_and_fetch_all
-
-
-def test_failover_on_non_setup_cluster(connection):
-    cursor = connection(7690, "coordinator").cursor()
-    with pytest.raises(Exception) as e:
-        execute_and_fetch_all(cursor, "DO FAILOVER;")
-    assert str(e.value) == "Failover aborted since cluster is uninitialized!"
-
-
-if __name__ == "__main__":
-    sys.exit(pytest.main([__file__, "-rA"]))
diff --git a/tests/e2e/high_availability_experimental/workloads.yaml b/tests/e2e/high_availability_experimental/workloads.yaml
index 11d5d75d9..f0b9a40ec 100644
--- a/tests/e2e/high_availability_experimental/workloads.yaml
+++ b/tests/e2e/high_availability_experimental/workloads.yaml
@@ -24,28 +24,6 @@ ha_cluster: &ha_cluster
         "REGISTER REPLICA replica_2 SYNC TO '127.0.0.1:10002' WITH COORDINATOR SERVER ON '127.0.0.1:10012'",
       ]
 
-noninitialized_cluster: &noninitialized_cluster
-  cluster:
-    replica_1:
-      args: ["--bolt-port", "7688", "--log-level=TRACE", "--coordinator-server-port=10011"]
-      log_file: "replication-e2e-replica1.log"
-      setup_queries: ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"]
-    replica_2:
-      args: ["--bolt-port", "7689", "--log-level=TRACE", "--coordinator-server-port=10012"]
-      log_file: "replication-e2e-replica2.log"
-      setup_queries: ["SET REPLICATION ROLE TO REPLICA WITH PORT 10002;"]
-    main:
-      args: ["--bolt-port", "7687", "--log-level=TRACE", "--coordinator-server-port=10013"]
-      log_file: "replication-e2e-main.log"
-      setup_queries: [
-        "REGISTER REPLICA replica_1 SYNC TO '127.0.0.1:10001'",
-        "REGISTER REPLICA replica_2 SYNC TO '127.0.0.1:10002'",
-      ]
-    coordinator:
-      args: ["--bolt-port", "7690", "--log-level=TRACE", "--coordinator"]
-      log_file: "replication-e2e-coordinator.log"
-      setup_queries: []
-
 
 workloads:
   - name: "Coordinator"
@@ -53,14 +31,9 @@ workloads:
     args: ["high_availability_experimental/coordinator.py"]
     <<: *ha_cluster
 
-  - name: "Uninitialized cluster"
+  - name: "Show replication cluster"
    binary: "tests/e2e/pytest_runner.sh"
-    args: ["high_availability_experimental/uninitialized_cluster.py"]
-    <<: *noninitialized_cluster
-
-  - name: "Client initiated failover"
-    binary: "tests/e2e/pytest_runner.sh"
-    args: ["high_availability_experimental/client_initiated_failover.py"]
+    args: ["high_availability_experimental/show_replication_cluster.py"]
 
   - name: "Automatic failover"
     binary: "tests/e2e/pytest_runner.sh"
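
With the client-facing DO FAILOVER path removed by this patch, the only end-to-end failover coverage left in the suite is the coordinator-driven one exercised by automatic_failover.py. The sketch below is a minimal illustration of that remaining flow, not part of the patch: it assumes the helpers already used by these tests (interactive_mg_runner, execute_and_fetch_all from common.py, the mg_sleep_and_assert polling helper) and a cluster description dict shaped like MEMGRAPH_INSTANCES_DESCRIPTION; the expected coordinator view after killing the main mirrors the updated expectation in test_simple_automatic_failover.

# Sketch only: the imports below assume the module layout used by the existing e2e tests.
import interactive_mg_runner
from common import execute_and_fetch_all
from mg_utils import mg_sleep_and_assert


def simple_automatic_failover_sketch(connection, memgraph_instances_description):
    # Start every instance of the HA cluster described by the workload definition.
    interactive_mg_runner.start_all(memgraph_instances_description)

    # Kill the current main; the coordinator's health checks should trigger failover on their own.
    interactive_mg_runner.kill(memgraph_instances_description, "instance_3")

    coord_cursor = connection(7690, "coordinator").cursor()

    def retrieve_data_show_repl_cluster():
        return sorted(list(execute_and_fetch_all(coord_cursor, "SHOW REPLICATION CLUSTER;")))

    # A surviving replica is promoted to main; the killed instance is reported as
    # not alive but still shows the "main" role it last held.
    expected_data_on_coord = [
        ("instance_1", "127.0.0.1:10011", True, "main"),
        ("instance_2", "127.0.0.1:10012", True, "replica"),
        ("instance_3", "127.0.0.1:10013", False, "main"),
    ]
    mg_sleep_and_assert(expected_data_on_coord, retrieve_data_show_repl_cluster)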