Only the leader performs callbacks
parent cf80687d1d
commit 6e758d3b5a
@@ -252,20 +252,28 @@ auto CoordinatorData::SetInstanceToMain(std::string instance_name) -> SetInstanc
 auto CoordinatorData::RegisterInstance(CoordinatorClientConfig config) -> RegisterInstanceCoordinatorStatus {
   auto lock = std::lock_guard{coord_data_lock_};
 
-  if (std::ranges::any_of(repl_instances_, [&config](ReplicationInstance const &instance) {
-        return instance.InstanceName() == config.instance_name;
-      })) {
+  auto const name_matches = [&config](ReplicationInstance const &instance) {
+    return instance.InstanceName() == config.instance_name;
+  };
+
+  if (std::ranges::any_of(repl_instances_, name_matches)) {
     return RegisterInstanceCoordinatorStatus::NAME_EXISTS;
   }
 
-  if (std::ranges::any_of(repl_instances_, [&config](ReplicationInstance const &instance) {
-        return instance.SocketAddress() == config.SocketAddress();
-      })) {
+  auto const socket_address_matches = [&config](ReplicationInstance const &instance) {
+    return instance.SocketAddress() == config.SocketAddress();
+  };
+
+  if (std::ranges::any_of(repl_instances_, socket_address_matches)) {
     return RegisterInstanceCoordinatorStatus::ENDPOINT_EXISTS;
   }
 
   try {
-    repl_instances_.emplace_back(this, std::move(config), replica_succ_cb_, replica_fail_cb_);
+    auto *repl_instance = &repl_instances_.emplace_back(this, std::move(config), replica_succ_cb_, replica_fail_cb_);
+    if (self_.IsLeader()) {
+      repl_instance->StartFrequentCheck();
+    }
     return RegisterInstanceCoordinatorStatus::SUCCESS;
 
   } catch (CoordinatorRegisterInstanceException const &) {
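The behavioral core of this hunk: a newly registered replication instance now starts its frequent health check only when this coordinator is currently the Raft leader, so follower coordinators no longer fire health-check callbacks. A minimal self-contained sketch of that pattern, where Registry, Instance, and is_leader_ are hypothetical stand-ins rather than the actual Memgraph types:

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-ins; only the named-predicate and leader-gating
// patterns from the hunk above are reproduced here.
struct Instance {
  std::string name;
  bool checking = false;
  void StartFrequentCheck() { checking = true; }  // would arm a periodic timer
};

class Registry {
 public:
  explicit Registry(bool is_leader) : is_leader_(is_leader) {}

  bool Register(std::string name) {
    // Named predicate instead of an inline lambda, as in the refactor above.
    auto const name_matches = [&name](Instance const &i) { return i.name == name; };
    if (std::ranges::any_of(instances_, name_matches)) return false;  // NAME_EXISTS

    auto *instance = &instances_.emplace_back(Instance{std::move(name)});
    if (is_leader_) instance->StartFrequentCheck();  // only the leader polls
    return true;
  }

 private:
  bool is_leader_;
  std::vector<Instance> instances_;
};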
@@ -35,6 +35,8 @@ CoordinatorInstance::CoordinatorInstance()
   state_machine_ = cs_new<CoordinatorStateMachine>();
   logger_ = nullptr;
 
   // TODO: (andi) Maybe params file
 
   // ASIO options
   asio_service::options asio_opts;
   asio_opts.thread_pool_size_ = 1;  // TODO: (andi) Improve this
@@ -94,5 +96,7 @@ auto CoordinatorInstance::GetAllCoordinators() const -> std::vector<ptr<srv_conf
   return all_srv_configs;
 }
 
+auto CoordinatorInstance::IsLeader() const -> bool { return raft_server_->is_leader(); }
+
 }  // namespace memgraph::coordination
 #endif
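For context, IsLeader() above is a thin wrapper over NuRaft's raft_server::is_leader(). A hedged sketch of how the constructor pieces from the previous hunk (state machine, null logger, ASIO options) typically come together via NuRaft's raft_launcher, based on the public NuRaft API; the port and timeout values here are illustrative, not Memgraph's actual configuration:

#include <libnuraft/nuraft.hxx>

using namespace nuraft;

// Sketch only: wires a state machine and state manager into a running
// raft_server using NuRaft's launcher helper.
ptr<raft_server> LaunchRaftServer(ptr<state_machine> state_machine, ptr<state_mgr> state_manager) {
  asio_service::options asio_opts;
  asio_opts.thread_pool_size_ = 1;  // single worker thread, as in the diff

  raft_params params;
  params.heart_beat_interval_ = 100;           // ms; illustrative values
  params.election_timeout_lower_bound_ = 200;  // ms
  params.election_timeout_upper_bound_ = 400;  // ms

  raft_launcher launcher;
  // A null logger mirrors `logger_ = nullptr;` in the constructor hunk.
  return launcher.init(state_machine, state_manager, /*logger=*/nullptr,
                       /*port=*/10111, asio_opts, params);
}

Leadership is then a direct query on the returned server, which is exactly what IsLeader() delegates to: raft_server->is_leader() returns true only on the current Raft leader.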
@@ -38,9 +38,12 @@ class CoordinatorInstance {
 
   auto InstanceName() const -> std::string;
   auto RaftSocketAddress() const -> std::string;
 
   auto AddCoordinatorInstance(uint32_t raft_server_id, uint32_t raft_port, std::string raft_address) -> void;
   auto GetAllCoordinators() const -> std::vector<ptr<srv_config>>;
 
+  auto IsLeader() const -> bool;
+
  private:
   ptr<state_machine> state_machine_;
   ptr<state_mgr> state_manager_;
@@ -51,6 +51,7 @@ class ReplicationInstance {
                        HealthCheckCallback main_fail_cb) -> bool;
   auto DemoteToReplica(HealthCheckCallback replica_succ_cb, HealthCheckCallback replica_fail_cb) -> bool;
 
+  auto StartFrequentCheck() -> void;
   auto PauseFrequentCheck() -> void;
   auto ResumeFrequentCheck() -> void;
 
@@ -25,7 +25,6 @@ ReplicationInstance::ReplicationInstance(CoordinatorData *data, CoordinatorClien
   if (!client_.DemoteToReplica()) {
     throw CoordinatorRegisterInstanceException("Failed to demote instance {} to replica", client_.InstanceName());
   }
-  client_.StartFrequentCheck();
 }
 
 auto ReplicationInstance::OnSuccessPing() -> void {
@@ -75,6 +74,7 @@ auto ReplicationInstance::DemoteToReplica(HealthCheckCallback replica_succ_cb, H
   return true;
 }
 
+auto ReplicationInstance::StartFrequentCheck() -> void { client_.StartFrequentCheck(); }
 auto ReplicationInstance::PauseFrequentCheck() -> void { client_.PauseFrequentCheck(); }
 auto ReplicationInstance::ResumeFrequentCheck() -> void { client_.ResumeFrequentCheck(); }
 
 
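Taken together with the constructor hunk above, StartFrequentCheck() is no longer called unconditionally at construction time; it is now a public operation that the caller invokes only on the leader (see RegisterInstance in the first hunk). A minimal sketch of what such a start/pause/resume check loop can look like; this is a std::jthread-based illustration under assumed semantics, not the actual CoordinatorClient implementation:

#include <atomic>
#include <chrono>
#include <functional>
#include <thread>
#include <utility>

// Sketch only: periodically invokes a health-check callback until stopped;
// Pause()/Resume() toggle the callback without killing the worker thread.
class FrequentCheck {
 public:
  explicit FrequentCheck(std::function<void()> cb) : cb_(std::move(cb)) {}

  void Start() {
    worker_ = std::jthread([this](std::stop_token st) {
      while (!st.stop_requested()) {
        if (!paused_.load()) cb_();
        std::this_thread::sleep_for(std::chrono::seconds(1));
      }
    });
  }
  void Pause() { paused_ = true; }
  void Resume() { paused_ = false; }

 private:
  std::function<void()> cb_;
  std::atomic<bool> paused_{false};
  std::jthread worker_;  // requests stop and joins on destruction
};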
@@ -1,7 +1,7 @@
 find_package(gflags REQUIRED)
 
 copy_e2e_python_files(ha_experimental coordinator.py)
-copy_e2e_python_files(ha_experimental automatic_failover.py)
+copy_e2e_python_files(ha_experimental single_coordinator.py)
+copy_e2e_python_files(ha_experimental distributed_coordinators.py)
 copy_e2e_python_files(ha_experimental manual_setting_replicas.py)
 copy_e2e_python_files(ha_experimental not_replicate_from_old_main.py)
@@ -29,10 +29,49 @@ interactive_mg_runner.MEMGRAPH_BINARY = os.path.normpath(os.path.join(interactiv
 TEMP_DIR = tempfile.TemporaryDirectory().name
 
 MEMGRAPH_INSTANCES_DESCRIPTION = {
-    "coordinator1": {
+    "instance_1": {
+        "args": [
+            "--bolt-port",
+            "7687",
+            "--log-level",
+            "TRACE",
+            "--coordinator-server-port",
+            "10011",
+        ],
+        "log_file": "instance_1.log",
+        "data_directory": f"{TEMP_DIR}/instance_1",
+        "setup_queries": [],
+    },
+    "instance_2": {
+        "args": [
+            "--bolt-port",
+            "7688",
+            "--log-level",
+            "TRACE",
+            "--coordinator-server-port",
+            "10012",
+        ],
+        "log_file": "instance_2.log",
+        "data_directory": f"{TEMP_DIR}/instance_2",
+        "setup_queries": [],
+    },
+    "instance_3": {
+        "args": [
+            "--bolt-port",
+            "7689",
+            "--log-level",
+            "TRACE",
+            "--coordinator-server-port",
+            "10013",
+        ],
+        "log_file": "instance_3.log",
+        "data_directory": f"{TEMP_DIR}/instance_3",
+        "setup_queries": [],
+    },
+    "coordinator_1": {
         "args": [
             "--bolt-port",
-            "7687",
+            "7690",
             "--log-level=TRACE",
             "--raft-server-id=1",
             "--raft-server-port=10111",
@@ -40,10 +79,10 @@ MEMGRAPH_INSTANCES_DESCRIPTION = {
         "log_file": "coordinator1.log",
         "setup_queries": [],
     },
-    "coordinator2": {
+    "coordinator_2": {
         "args": [
             "--bolt-port",
-            "7688",
+            "7691",
             "--log-level=TRACE",
             "--raft-server-id=2",
             "--raft-server-port=10112",
@@ -51,10 +90,10 @@ MEMGRAPH_INSTANCES_DESCRIPTION = {
         "log_file": "coordinator2.log",
         "setup_queries": [],
     },
-    "coordinator3": {
+    "coordinator_3": {
         "args": [
             "--bolt-port",
-            "7689",
+            "7692",
             "--log-level=TRACE",
             "--raft-server-id=3",
             "--raft-server-port=10113",
@@ -63,6 +102,10 @@ MEMGRAPH_INSTANCES_DESCRIPTION = {
         "setup_queries": [
             "ADD COORDINATOR 1 ON '127.0.0.1:10111'",
             "ADD COORDINATOR 2 ON '127.0.0.1:10112'",
+            "REGISTER INSTANCE instance_1 ON '127.0.0.1:10011' WITH '127.0.0.1:10001';",
+            "REGISTER INSTANCE instance_2 ON '127.0.0.1:10012' WITH '127.0.0.1:10002';",
+            "REGISTER INSTANCE instance_3 ON '127.0.0.1:10013' WITH '127.0.0.1:10003';",
+            "SET INSTANCE instance_3 TO MAIN",
         ],
     },
 }
@@ -72,73 +115,83 @@ def test_coordinators_communication():
     safe_execute(shutil.rmtree, TEMP_DIR)
     interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
 
-    coordinator3_cursor = connect(host="localhost", port=7689).cursor()
+    coordinator3_cursor = connect(host="localhost", port=7692).cursor()
 
     def check_coordinator3():
        return sorted(list(execute_and_fetch_all(coordinator3_cursor, "SHOW INSTANCES")))
 
-    expected_cluster = [
+    expected_cluster_coord3 = [
         ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
         ("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
         ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
         ("instance_1", "", "127.0.0.1:10011", True, "replica"),
         ("instance_2", "", "127.0.0.1:10012", True, "replica"),
         ("instance_3", "", "127.0.0.1:10013", True, "main"),
     ]
-    mg_sleep_and_assert(expected_cluster, check_coordinator3)
+    mg_sleep_and_assert(expected_cluster_coord3, check_coordinator3)
 
-    coordinator1_cursor = connect(host="localhost", port=7687).cursor()
+    coordinator1_cursor = connect(host="localhost", port=7690).cursor()
 
     def check_coordinator1():
         return sorted(list(execute_and_fetch_all(coordinator1_cursor, "SHOW INSTANCES")))
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator1)
+    # TODO: (andi) This should be solved eventually
+    expected_cluster_not_shared = [
+        ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
+        ("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
+        ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
+    ]
+
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
 
-    coordinator2_cursor = connect(host="localhost", port=7688).cursor()
+    coordinator2_cursor = connect(host="localhost", port=7691).cursor()
 
     def check_coordinator2():
         return sorted(list(execute_and_fetch_all(coordinator2_cursor, "SHOW INSTANCES")))
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator2)
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator2)
 
 
 def test_coordinators_communication_with_restarts():
     safe_execute(shutil.rmtree, TEMP_DIR)
     interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
 
-    expected_cluster = [
+    expected_cluster_not_shared = [
         ("coordinator_1", "127.0.0.1:10111", "", True, "coordinator"),
         ("coordinator_2", "127.0.0.1:10112", "", True, "coordinator"),
         ("coordinator_3", "127.0.0.1:10113", "", True, "coordinator"),
     ]
 
-    coordinator1_cursor = connect(host="localhost", port=7687).cursor()
+    coordinator1_cursor = connect(host="localhost", port=7690).cursor()
 
     def check_coordinator1():
         return sorted(list(execute_and_fetch_all(coordinator1_cursor, "SHOW INSTANCES")))
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator1)
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
 
-    coordinator2_cursor = connect(host="localhost", port=7688).cursor()
+    coordinator2_cursor = connect(host="localhost", port=7691).cursor()
 
     def check_coordinator2():
         return sorted(list(execute_and_fetch_all(coordinator2_cursor, "SHOW INSTANCES")))
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator2)
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator2)
 
-    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator1")
-    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator1")
-    coordinator1_cursor = connect(host="localhost", port=7687).cursor()
+    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_1")
+    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_1")
+    coordinator1_cursor = connect(host="localhost", port=7690).cursor()
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator1)
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
 
-    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator1")
-    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator2")
+    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_1")
+    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_2")
 
-    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator1")
-    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator2")
-    coordinator1_cursor = connect(host="localhost", port=7687).cursor()
-    coordinator2_cursor = connect(host="localhost", port=7688).cursor()
+    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_1")
+    interactive_mg_runner.start(MEMGRAPH_INSTANCES_DESCRIPTION, "coordinator_2")
+    coordinator1_cursor = connect(host="localhost", port=7690).cursor()
+    coordinator2_cursor = connect(host="localhost", port=7691).cursor()
 
-    mg_sleep_and_assert(expected_cluster, check_coordinator1)
-    mg_sleep_and_assert(expected_cluster, check_coordinator2)
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator1)
+    mg_sleep_and_assert(expected_cluster_not_shared, check_coordinator2)
 
 
 if __name__ == "__main__":
@@ -28,9 +28,9 @@ workloads:
     args: ["high_availability_experimental/coordinator.py"]
     <<: *ha_cluster
 
-  - name: "Automatic failover"
+  - name: "Single coordinator"
     binary: "tests/e2e/pytest_runner.sh"
-    args: ["high_availability_experimental/automatic_failover.py"]
+    args: ["high_availability_experimental/single_coordinator.py"]
 
   - name: "Disabled manual setting of replication cluster"
     binary: "tests/e2e/pytest_runner.sh"