Only the leader performs callbacks

Andi Skrgat 2024-02-06 09:08:40 +01:00
parent cf80687d1d
commit 6e758d3b5a
9 changed files with 109 additions and 40 deletions
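In short: instance registration no longer starts the health-check loop as a side effect of constructing a ReplicationInstance. The check is started explicitly, and only on the coordinator that is currently the Raft leader, so follower coordinators stay passive. A minimal, compilable sketch of that gating pattern (the types below are simplified stand-ins, not the real Memgraph classes):

```cpp
#include <iostream>
#include <string>
#include <vector>

// Stand-in for the Raft-backed CoordinatorInstance (self_ in the diff below).
struct RaftNode {
  bool leader{false};
  bool IsLeader() const { return leader; }
};

// Stand-in for ReplicationInstance: the check no longer starts in the ctor.
struct ReplicationInstance {
  std::string name;
  bool checking{false};
  void StartFrequentCheck() { checking = true; }
};

int main() {
  RaftNode self{true};  // pretend this coordinator won the election
  std::vector<ReplicationInstance> repl_instances;

  // Register the instance unconditionally, but drive callbacks only as leader.
  auto *repl_instance = &repl_instances.emplace_back(ReplicationInstance{"instance_1"});
  if (self.IsLeader()) {
    repl_instance->StartFrequentCheck();
  }
  std::cout << repl_instance->name << " checking=" << std::boolalpha
            << repl_instance->checking << '\n';
}
```

The first diff below applies exactly this shape in CoordinatorData::RegisterInstance.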

@@ -252,20 +252,28 @@ auto CoordinatorData::SetInstanceToMain(std::string instance_name) -> SetInstanc
 auto CoordinatorData::RegisterInstance(CoordinatorClientConfig config) -> RegisterInstanceCoordinatorStatus {
   auto lock = std::lock_guard{coord_data_lock_};
-  if (std::ranges::any_of(repl_instances_, [&config](ReplicationInstance const &instance) {
-        return instance.InstanceName() == config.instance_name;
-      })) {
+  auto const name_matches = [&config](ReplicationInstance const &instance) {
+    return instance.InstanceName() == config.instance_name;
+  };
+
+  if (std::ranges::any_of(repl_instances_, name_matches)) {
     return RegisterInstanceCoordinatorStatus::NAME_EXISTS;
   }
-  if (std::ranges::any_of(repl_instances_, [&config](ReplicationInstance const &instance) {
-        return instance.SocketAddress() == config.SocketAddress();
-      })) {
+  auto const socket_address_matches = [&config](ReplicationInstance const &instance) {
+    return instance.SocketAddress() == config.SocketAddress();
+  };
+
+  if (std::ranges::any_of(repl_instances_, socket_address_matches)) {
     return RegisterInstanceCoordinatorStatus::ENDPOINT_EXISTS;
   }
 
   try {
-    repl_instances_.emplace_back(this, std::move(config), replica_succ_cb_, replica_fail_cb_);
+    auto *repl_instance = &repl_instances_.emplace_back(this, std::move(config), replica_succ_cb_, replica_fail_cb_);
+    if (self_.IsLeader()) {
+      repl_instance->StartFrequentCheck();
+    }
     return RegisterInstanceCoordinatorStatus::SUCCESS;
   } catch (CoordinatorRegisterInstanceException const &) {
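Beyond the leader gate, the two uniqueness checks are hoisted out of the `if` conditions into named lambdas; behavior is unchanged. A standalone rendering of the same duplicate detection, with simplified stand-ins for CoordinatorClientConfig and ReplicationInstance:

```cpp
#include <algorithm>
#include <string>
#include <vector>

struct Config {  // simplified stand-in for CoordinatorClientConfig
  std::string instance_name;
  std::string socket_address;
};

struct Instance {  // simplified stand-in for ReplicationInstance
  std::string name;
  std::string address;
  const std::string &InstanceName() const { return name; }
  const std::string &SocketAddress() const { return address; }
};

// True when a new config collides with an existing instance by name or by
// endpoint, mirroring the NAME_EXISTS / ENDPOINT_EXISTS checks above.
bool Conflicts(const std::vector<Instance> &instances, const Config &config) {
  auto const name_matches = [&config](Instance const &instance) {
    return instance.InstanceName() == config.instance_name;
  };
  auto const socket_address_matches = [&config](Instance const &instance) {
    return instance.SocketAddress() == config.socket_address;
  };
  return std::ranges::any_of(instances, name_matches) ||
         std::ranges::any_of(instances, socket_address_matches);
}

int main() {
  std::vector<Instance> instances{{"instance_1", "127.0.0.1:10011"}};
  Config duplicate{"instance_1", "127.0.0.1:10099"};
  return Conflicts(instances, duplicate) ? 0 : 1;  // name collision -> conflict
}
```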

@@ -35,6 +35,8 @@ CoordinatorInstance::CoordinatorInstance()
   state_machine_ = cs_new<CoordinatorStateMachine>();
   logger_ = nullptr;
 
+  // TODO: (andi) Maybe params file
+  // ASIO options
   asio_service::options asio_opts;
   asio_opts.thread_pool_size_ = 1;  // TODO: (andi) Improve this
@@ -94,5 +96,7 @@ auto CoordinatorInstance::GetAllCoordinators() const -> std::vector<ptr<srv_conf
   return all_srv_configs;
 }
 
+auto CoordinatorInstance::IsLeader() const -> bool { return raft_server_->is_leader(); }
+
 }  // namespace memgraph::coordination
 #endif
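The new IsLeader() is a one-line forwarder to the underlying Raft server's is_leader() query (the cs_new/asio_service usage above suggests a NuRaft-style API). A compilable sketch of the same shape, with the Raft handle stubbed out:

```cpp
#include <memory>

// Stub standing in for the Raft server; only the leadership query is kept.
struct RaftServerStub {
  bool leader{false};
  bool is_leader() const { return leader; }
};

class CoordinatorInstance {
 public:
  explicit CoordinatorInstance(std::shared_ptr<RaftServerStub> srv)
      : raft_server_(std::move(srv)) {}

  // Forward leadership checks so callers never touch the Raft handle directly.
  auto IsLeader() const -> bool { return raft_server_->is_leader(); }

 private:
  std::shared_ptr<RaftServerStub> raft_server_;
};

int main() {
  auto srv = std::make_shared<RaftServerStub>();
  srv->leader = true;  // pretend we hold the lease
  CoordinatorInstance coord{srv};
  return coord.IsLeader() ? 0 : 1;
}
```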

@@ -38,9 +38,12 @@ class CoordinatorInstance {
   auto InstanceName() const -> std::string;
   auto RaftSocketAddress() const -> std::string;
   auto AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_port, std::string raft_address) -> void;
 
   auto GetAllCoordinators() const -> std::vector<ptr<srv_config>>;
+
+  auto IsLeader() const -> bool;
 
  private:
   ptr<state_machine> state_machine_;
   ptr<state_mgr> state_manager_;

@@ -51,6 +51,7 @@ class ReplicationInstance {
                     HealthCheckCallback main_fail_cb) -> bool;
   auto DemoteToReplica(HealthCheckCallback replica_succ_cb, HealthCheckCallback replica_fail_cb) -> bool;
 
+  auto StartFrequentCheck() -> void;
   auto PauseFrequentCheck() -> void;
   auto ResumeFrequentCheck() -> void;

@@ -25,7 +25,6 @@ ReplicationInstance::ReplicationInstance(CoordinatorData *data, CoordinatorClien
   if (!client_.DemoteToReplica()) {
     throw CoordinatorRegisterInstanceException("Failed to demote instance {} to replica", client_.InstanceName());
   }
-  client_.StartFrequentCheck();
 }
 
 auto ReplicationInstance::OnSuccessPing() -> void {
@@ -75,6 +74,7 @@ auto ReplicationInstance::DemoteToReplica(HealthCheckCallback replica_succ_cb, H
   return true;
 }
 
+auto ReplicationInstance::StartFrequentCheck() -> void { client_.StartFrequentCheck(); }
 auto ReplicationInstance::PauseFrequentCheck() -> void { client_.PauseFrequentCheck(); }
 auto ReplicationInstance::ResumeFrequentCheck() -> void { client_.ResumeFrequentCheck(); }
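Together with the constructor change above, the frequent check now has an explicit lifecycle: nothing starts at construction, and the owner can start, pause, or resume the check on demand. A stubbed sketch of the forwarding trio (CoordinatorClientStub is a placeholder for the real client_):

```cpp
#include <atomic>

// Stand-in for the real CoordinatorClient held as client_.
struct CoordinatorClientStub {
  std::atomic<bool> running{false};
  void StartFrequentCheck() { running = true; }
  void PauseFrequentCheck() { running = false; }
  void ResumeFrequentCheck() { running = true; }
};

class ReplicationInstance {
 public:
  // The constructor intentionally does NOT start the check anymore;
  // the leader calls StartFrequentCheck() after registration.
  auto StartFrequentCheck() -> void { client_.StartFrequentCheck(); }
  auto PauseFrequentCheck() -> void { client_.PauseFrequentCheck(); }
  auto ResumeFrequentCheck() -> void { client_.ResumeFrequentCheck(); }

 private:
  CoordinatorClientStub client_;
};

int main() {
  ReplicationInstance instance;
  instance.StartFrequentCheck();  // leader-only in the real code
  instance.PauseFrequentCheck();  // e.g. while a failover is in progress
  instance.ResumeFrequentCheck();
}
```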

@@ -1,7 +1,7 @@
 find_package(gflags REQUIRED)
 
 copy_e2e_python_files(ha_experimental coordinator.py)
-copy_e2e_python_files(ha_experimental automatic_failover.py)
+copy_e2e_python_files(ha_experimental single_coordinator.py)
 copy_e2e_python_files(ha_experimental distributed_coordinators.py)
 copy_e2e_python_files(ha_experimental manual_setting_replicas.py)
 copy_e2e_python_files(ha_experimental not_replicate_from_old_main.py)

@@ -29,10 +29,49 @@ interactive_mg_runner.MEMGRAPH_BINARY = os.path.normpath(os.path.join(interactiv
 TEMP_DIR = tempfile.TemporaryDirectory().name
 
 MEMGRAPH_INSTANCES_DESCRIPTION = {
-    "coordinator1": {
+    "instance_1": {
+        "args": [
+            "--bolt-port",
+            "7687",
+            "--log-level",
+            "TRACE",
+            "--coordinator-server-port",
+            "10011",
+        ],
+        "log_file": "instance_1.log",
+        "data_directory": f"{TEMP_DIR}/instance_1",
+        "setup_queries": [],
+    },
+    "instance_2": {
+        "args": [
+            "--bolt-port",
+            "7688",
+            "--log-level",
+            "TRACE",
+            "--coordinator-server-port",
+            "10012",
+        ],
+        "log_file": "instance_2.log",
+        "data_directory": f"{TEMP_DIR}/instance_2",
+        "setup_queries": [],
+    },
+    "instance_3": {
+        "args": [
+            "--bolt-port",
+            "7689",
+            "--log-level",
+            "TRACE",
+            "--coordinator-server-port",
+            "10013",
+        ],
+        "log_file": "instance_3.log",
+        "data_directory": f"{TEMP_DIR}/instance_3",
+        "setup_queries": [],
+    },
+    "coordinator_1": {
         "args": [
             "--bolt-port",
-            "7687",
+            "7690",
             "--log-level=TRACE",
             "--raft-server-id=1",
             "--raft-server-port=10111",
@@ -40,10 +79,10 @@ MEMGRAPH_INSTANCES_DESCRIPTION = {
         "log_file": "coordinator1.log",
         "setup_queries": [],
     },
-    "coordinator2": {
+    "coordinator_2": {
         "args": [
             "--bolt-port",
-            "7688",
+            "7691",
             "--log-level=TRACE",
             "--raft-server-id=2",
             "--raft-server-port=10112",
@@ -51,10 +90,10 @@ MEMGRAPH_INSTANCES_DESCRIPTION = {
         "log_file": "coordinator2.log",
         "setup_queries": [],
     },
-    "coordinator3": {
+    "coordinator_3": {
         "args": [
             "--bolt-port",
-            "7689",
+            "7692",
             "--log-level=TRACE",
             "--raft-server-id=3",
             "--raft-server-port=10113",
@@ -63,6 +102,10 @@ MEMGRAPH_INSTANCES_DESCRIPTION = {
         "setup_queries": [
             "ADD COORDINATOR 1 ON '127.0.0.1:10111'",
             "ADD COORDINATOR 2 ON '127.0.0.1:10112'",
+            "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001';",
+            "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002';",
+            "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003';",
+            "SET INSTANCE instance_3 TO MAIN",
         ],
     },
 }
@@ -72,73 +115,83 @@ def test_coordinators_communication():
     safe_execute(shutil.rmtree, TEMP_DIR)
     interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
 
-    coordinator3_cursor = connect(host="localhost", port=7689).cursor()
+    coordinator3_cursor = connect(host="localhost", port=7692).cursor()
 
     def check_coordinator3():
         return sorted(list(execute_and_fetch_all(coordinator3_cursor, "SHOW INSTANCES")))
 
-    expected_cluster = [
+    expected_cluster_coord3 = [
         ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
         ("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
         ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
+        ("instance_1", "", "127.0.0.1:10011", True, "replica"),
+        ("instance_2", "", "127.0.0.1:10012", True, "replica"),
+        ("instance_3", "", "127.0.0.1:10013", True, "main"),
     ]
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator3)
+    mg_sleep_and_assert(expected_cluster_coord3, check_coordinator3)
 
-    coordinator1_cursor = connect(host="localhost", port=7687).cursor()
+    coordinator1_cursor = connect(host="localhost", port=7690).cursor()
 
     def check_coordinator1():
         return sorted(list(execute_and_fetch_all(coordinator1_cursor, "SHOW INSTANCES")))
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator1)
+    # TODO: (andi) This should be solved eventually
+    expected_cluster_not_shared = [
+        ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
+        ("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
+        ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
+    ]
+
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
 
-    coordinator2_cursor = connect(host="localhost", port=7688).cursor()
+    coordinator2_cursor = connect(host="localhost", port=7691).cursor()
 
     def check_coordinator2():
         return sorted(list(execute_and_fetch_all(coordinator2_cursor, "SHOW INSTANCES")))
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator2)
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator2)
 
 
 def test_coordinators_communication_with_restarts():
     safe_execute(shutil.rmtree, TEMP_DIR)
     interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
 
-    expected_cluster = [
+    expected_cluster_not_shared = [
         ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
         ("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
         ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
     ]
 
-    coordinator1_cursor = connect(host="localhost", port=7687).cursor()
+    coordinator1_cursor = connect(host="localhost", port=7690).cursor()
 
     def check_coordinator1():
         return sorted(list(execute_and_fetch_all(coordinator1_cursor, "SHOW INSTANCES")))
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator1)
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
 
-    coordinator2_cursor = connect(host="localhost", port=7688).cursor()
+    coordinator2_cursor = connect(host="localhost", port=7691).cursor()
 
     def check_coordinator2():
         return sorted(list(execute_and_fetch_all(coordinator2_cursor, "SHOW INSTANCES")))
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator2)
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator2)
 
-    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator1")
-    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator1")
-    coordinator1_cursor = connect(host="localhost", port=7687).cursor()
+    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_1")
+    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_1")
+    coordinator1_cursor = connect(host="localhost", port=7690).cursor()
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator1)
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
 
-    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator1")
-    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator2")
+    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_1")
+    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_2")
 
-    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator1")
-    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator2")
-    coordinator1_cursor = connect(host="localhost", port=7687).cursor()
-    coordinator2_cursor = connect(host="localhost", port=7688).cursor()
+    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_1")
+    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_2")
+    coordinator1_cursor = connect(host="localhost", port=7690).cursor()
+    coordinator2_cursor = connect(host="localhost", port=7691).cursor()
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator1)
-    mg_sleep_and_assert(expected_cluster, check_coordinator2)
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator2)
 
 
 if __name__ == "__main__":

@@ -28,9 +28,9 @@ workloads:
     args: ["high_availability_experimental/coordinator.py"]
     <<: *ha_cluster
 
-  - name: "Automatic failover"
+  - name: "Single coordinator"
     binary: "tests/e2e/pytest_runner.sh"
-    args: ["high_availability_experimental/automatic_failover.py"]
+    args: ["high_availability_experimental/single_coordinator.py"]
 
   - name: "Disabled manual setting of replication cluster"
     binary: "tests/e2e/pytest_runner.sh"