memgraph/tests/e2e/replication/show_while_creating_invalid_state.py
Jeremy B f629de7e60
Save replication settings (#415)
* Storage takes care of the saving of setting when a new replica is added

* Restore replicas at startup

* Modify interactive_mg_runner + memgraph to support that data-directory can be configured in CONTEXT

* Extend e2e test

* Correct typo

* Add flag to config to specify when replication should be stored (true by default when starting Memgraph)

* Remove un-necessary "--" in yaml file

* Make sure Memgraph stops if a replica can't be restored.

* Add UT covering the parsing  of ReplicaStatus to/from json

* Add assert in e2e script to check that a port is free before using it

* Add test covering crash on Jepsen

* Make sure the application crashes if it starts on corrupted replication info

Starting with a non-responsive replica is allowed.

* Add temporary startup flag: this is needed so that Jepsen does not automatically restore replicas on startup of main. This will be removed in T0835
2022-07-07 13:30:28 +02:00

402 lines
16 KiB
Python

# Copyright 2022 Memgraph Ltd.
#
# Use of this software is governed by the Business Source License
# included in the file licenses/BSL.txt; by using this file, you agree to be bound by the terms of the Business Source
# License, and you may not use this file except in compliance with the Business Source License.
#
# As of the Change Date specified in that file, in accordance with
# the Business Source License, use of this software will be governed
# by the Apache License, Version 2.0, included in the file
# licenses/APL.txt.
import sys
import os
import pytest
import time
from common import execute_and_fetch_all
import interactive_mg_runner
import mgclient
import tempfile
# Tell the shared runner where things live, relative to this script:
# PROJECT_DIR is the directory four levels above the script, BUILD_DIR is
# <PROJECT_DIR>/build and the memgraph binary is looked up inside BUILD_DIR.
interactive_mg_runner.SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
interactive_mg_runner.PROJECT_DIR = os.path.normpath(
    os.path.join(interactive_mg_runner.SCRIPT_DIR, *([os.pardir] * 4))
)
interactive_mg_runner.BUILD_DIR = os.path.normpath(
    os.path.join(interactive_mg_runner.PROJECT_DIR, "build")
)
interactive_mg_runner.MEMGRAPH_BINARY = os.path.normpath(
    os.path.join(interactive_mg_runner.BUILD_DIR, "memgraph")
)
# Cluster used by test_show_replicas: four replicas and one main.
MEMGRAPH_INSTANCES_DESCRIPTION = {
    # replica_N serves Bolt on port 7687 + N and replication on port 10000 + N.
    **{
        f"replica_{n}": {
            "args": ["--bolt-port", f"{7687 + n}", "--log-level=TRACE"],
            "log_file": f"replica{n}.log",
            "setup_queries": [f"SET REPLICATION ROLE TO REPLICA WITH PORT {10000 + n};"],
        }
        for n in range(1, 5)
    },
    # main registers replica_1 and replica_2 as SYNC, replica_3 and replica_4 as ASYNC.
    "main": {
        "args": ["--bolt-port", "7687", "--log-level=TRACE"],
        "log_file": "main.log",
        "setup_queries": [
            f"REGISTER REPLICA replica_{n} {mode} TO '127.0.0.1:{10000 + n}';"
            for n, mode in enumerate(("SYNC", "SYNC", "ASYNC", "ASYNC"), start=1)
        ],
    },
}
def test_show_replicas(connection):
    """Check the output of SHOW REPLICAS while replicas are dropped and go down.

    Steps:
      0/ Start all instances ourselves, so that the test can kill them without
         relying on external tooling to kill processes.
      1/ Every replica is listed, with the expected columns, in state "ready".
      2/ Drop replica_2: it must no longer appear in SHOW REPLICAS.
      3/ Kill replica_1 and replica_3 and stop replica_4: all three must be
         listed in state "invalid".
    """
    # 0/
    interactive_mg_runner.start_all(MEMGRAPH_INSTANCES_DESCRIPTION)
    main_cursor = connection(7687, "main").cursor()

    # Replicas main is expected to list: name -> (socket address, sync mode).
    registered = {
        "replica_1": ("127.0.0.1:10001", "sync"),
        "replica_2": ("127.0.0.1:10002", "sync"),
        "replica_3": ("127.0.0.1:10003", "async"),
        "replica_4": ("127.0.0.1:10004", "async"),
    }

    def show_replicas():
        # Rows are compared as a set: the test does not depend on row order.
        return set(execute_and_fetch_all(main_cursor, "SHOW REPLICAS;"))

    def rows_in_state(state):
        # No data is written in this test, so both timestamp columns stay at 0.
        return {(name, address, mode, 0, 0, state) for name, (address, mode) in registered.items()}

    # 1/
    listed = show_replicas()
    # The column names are read from the cursor right after SHOW REPLICAS ran on it.
    column_names = {column.name for column in main_cursor.description}
    assert column_names == {
        "name",
        "socket_address",
        "sync_mode",
        "current_timestamp_of_replica",
        "number_of_timestamp_behind_master",
        "state",
    }
    assert listed == rows_in_state("ready")

    # 2/
    execute_and_fetch_all(main_cursor, "DROP REPLICA replica_2")
    del registered["replica_2"]
    assert show_replicas() == rows_in_state("ready")

    # 3/
    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "replica_1")
    interactive_mg_runner.kill(MEMGRAPH_INSTANCES_DESCRIPTION, "replica_3")
    interactive_mg_runner.stop(MEMGRAPH_INSTANCES_DESCRIPTION, "replica_4")
    # Give main some time to notice that the replicas are down.
    time.sleep(2)
    assert show_replicas() == rows_in_state("invalid")
def test_basic_recovery(connection):
    """Check that main recovers its replicas and data after being killed and restarted.

    Steps:
      0/ Start all instances ourselves, so that the test can kill them without
         relying on external tooling to kill processes.
      1/ Check that all replicas have the correct state: they should all be ready.
      2/ Kill main.
      3/ Restart main.
      4/ Check that all replicas have the correct state: they should all be ready.
      5/ Drop one replica.
      6/ Add some data to main, then kill it and restart it.
      7/ Check that all replicas but the dropped one have the expected data.
      8/ Kill another replica.
      9/ Add some data to main.
      10/ Re-add the two replicas that were dropped/killed and check the data.
      11/ Kill another replica.
      12/ Add some more data to main.
      13/ Check the states of the replicas.
    """
    # 0/
    # Keep the TemporaryDirectory object referenced for the whole test: main is
    # restarted several times with the same "data_directory" entry.
    data_directory = tempfile.TemporaryDirectory()
    CONFIGURATION = {
        "replica_1": {
            "args": ["--bolt-port", "7688", "--log-level=TRACE"],
            "log_file": "replica1.log",
            "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"],
        },
        "replica_2": {
            "args": ["--bolt-port", "7689", "--log-level=TRACE"],
            "log_file": "replica2.log",
            "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10002;"],
        },
        "replica_3": {
            "args": ["--bolt-port", "7690", "--log-level=TRACE"],
            "log_file": "replica3.log",
            "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10003;"],
        },
        "replica_4": {
            "args": ["--bolt-port", "7691", "--log-level=TRACE"],
            "log_file": "replica4.log",
            "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10004;"],
        },
        "main": {
            "args": ["--bolt-port", "7687", "--log-level=TRACE", "--storage-recover-on-startup=true"],
            "log_file": "main.log",
            "setup_queries": [],
            "data_directory": f"{data_directory.name}",
        },
    }
    # Derived from CONFIGURATION so that the role check below covers every replica.
    replica_names = [name for name in CONFIGURATION if name != "main"]

    interactive_mg_runner.start_all(CONFIGURATION)
    cursor = connection(7687, "main").cursor()

    # We register the replicas manually and not via the configuration, otherwise
    # restarting main would also execute these registrations again.
    execute_and_fetch_all(cursor, "REGISTER REPLICA replica_1 SYNC TO '127.0.0.1:10001';")
    execute_and_fetch_all(cursor, "REGISTER REPLICA replica_2 SYNC TO '127.0.0.1:10002';")
    execute_and_fetch_all(cursor, "REGISTER REPLICA replica_3 ASYNC TO '127.0.0.1:10003';")
    execute_and_fetch_all(cursor, "REGISTER REPLICA replica_4 ASYNC TO '127.0.0.1:10004';")

    # 1/
    expected_data = {
        ("replica_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
        ("replica_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
        ("replica_3", "127.0.0.1:10003", "async", 0, 0, "ready"),
        ("replica_4", "127.0.0.1:10004", "async", 0, 0, "ready"),
    }
    actual_data = set(execute_and_fetch_all(cursor, "SHOW REPLICAS;"))
    assert actual_data == expected_data

    def check_roles():
        # Must only be called while every instance is running: each one is queried.
        assert "main" == interactive_mg_runner.MEMGRAPH_INSTANCES["main"].query("SHOW REPLICATION ROLE;")[0][0]
        for name in replica_names:
            assert "replica" == interactive_mg_runner.MEMGRAPH_INSTANCES[name].query("SHOW REPLICATION ROLE;")[0][0]

    check_roles()

    # 2/
    interactive_mg_runner.kill(CONFIGURATION, "main")
    time.sleep(2)

    # 3/
    interactive_mg_runner.start(CONFIGURATION, "main")
    # The previous connection belonged to the killed process; open a new one.
    cursor = connection(7687, "main").cursor()
    check_roles()

    # 4/
    # We leave some time for the main to recover.
    time.sleep(2)
    actual_data = set(execute_and_fetch_all(cursor, "SHOW REPLICAS;"))
    assert actual_data == expected_data

    # 5/
    execute_and_fetch_all(cursor, "DROP REPLICA replica_2;")

    # 6/
    execute_and_fetch_all(cursor, "CREATE (p1:Number {name:'Magic', value:42})")
    interactive_mg_runner.kill(CONFIGURATION, "main")
    time.sleep(2)
    interactive_mg_runner.start(CONFIGURATION, "main")
    cursor = connection(7687, "main").cursor()
    check_roles()

    # 7/
    QUERY_TO_CHECK = "MATCH (node) return node;"
    res_from_main = execute_and_fetch_all(cursor, QUERY_TO_CHECK)
    assert len(res_from_main) == 1
    for index in (1, 3, 4):
        assert res_from_main == interactive_mg_runner.MEMGRAPH_INSTANCES[f"replica_{index}"].query(QUERY_TO_CHECK)
    expected_data = {
        ("replica_1", "127.0.0.1:10001", "sync", 2, 0, "ready"),
        ("replica_3", "127.0.0.1:10003", "async", 2, 0, "ready"),
        ("replica_4", "127.0.0.1:10004", "async", 2, 0, "ready"),
    }
    actual_data = set(execute_and_fetch_all(cursor, "SHOW REPLICAS;"))
    assert actual_data == expected_data
    # replica_2 was dropped, we check it does not have the data from main.
    assert len(interactive_mg_runner.MEMGRAPH_INSTANCES["replica_2"].query(QUERY_TO_CHECK)) == 0

    # 8/
    interactive_mg_runner.kill(CONFIGURATION, "replica_3")

    # 9/
    execute_and_fetch_all(cursor, "CREATE (p1:Number {name:'Magic_again', value:43})")
    res_from_main = execute_and_fetch_all(cursor, QUERY_TO_CHECK)
    assert len(res_from_main) == 2

    # 10/
    execute_and_fetch_all(cursor, "REGISTER REPLICA replica_2 SYNC TO '127.0.0.1:10002';")
    interactive_mg_runner.start(CONFIGURATION, "replica_3")
    # We leave some time for the re-added replicas to catch up with main.
    time.sleep(2)
    expected_data = {
        ("replica_1", "127.0.0.1:10001", "sync", 6, 0, "ready"),
        ("replica_2", "127.0.0.1:10002", "sync", 6, 0, "ready"),
        ("replica_3", "127.0.0.1:10003", "async", 6, 0, "ready"),
        ("replica_4", "127.0.0.1:10004", "async", 6, 0, "ready"),
    }
    actual_data = set(execute_and_fetch_all(cursor, "SHOW REPLICAS;"))
    assert actual_data == expected_data
    for index in (1, 2, 3, 4):
        assert interactive_mg_runner.MEMGRAPH_INSTANCES[f"replica_{index}"].query(QUERY_TO_CHECK) == res_from_main

    # 11/
    interactive_mg_runner.kill(CONFIGURATION, "replica_1")
    time.sleep(1)
    expected_data = {
        ("replica_1", "127.0.0.1:10001", "sync", 0, 0, "invalid"),
        ("replica_2", "127.0.0.1:10002", "sync", 6, 0, "ready"),
        ("replica_3", "127.0.0.1:10003", "async", 6, 0, "ready"),
        ("replica_4", "127.0.0.1:10004", "async", 6, 0, "ready"),
    }
    actual_data = set(execute_and_fetch_all(cursor, "SHOW REPLICAS;"))
    assert actual_data == expected_data

    # 12/
    execute_and_fetch_all(cursor, "CREATE (p1:Number {name:'Magic_again_again', value:44})")
    res_from_main = execute_and_fetch_all(cursor, QUERY_TO_CHECK)
    assert len(res_from_main) == 3
    for index in (2, 3, 4):
        assert interactive_mg_runner.MEMGRAPH_INSTANCES[f"replica_{index}"].query(QUERY_TO_CHECK) == res_from_main

    # 13/
    expected_data = {
        ("replica_1", "127.0.0.1:10001", "sync", 0, 0, "invalid"),
        ("replica_2", "127.0.0.1:10002", "sync", 9, 0, "ready"),
        ("replica_3", "127.0.0.1:10003", "async", 9, 0, "ready"),
        ("replica_4", "127.0.0.1:10004", "async", 9, 0, "ready"),
    }
    actual_data = set(execute_and_fetch_all(cursor, "SHOW REPLICAS;"))
    assert actual_data == expected_data
def test_conflict_at_startup(connection):
    """Check that two instances, each with its own data directory, both start up as main."""
    # Keep both TemporaryDirectory objects referenced until the end of the test:
    # a TemporaryDirectory removes its directory once the object is finalized.
    first_directory = tempfile.TemporaryDirectory()
    second_directory = tempfile.TemporaryDirectory()
    CONFIGURATION = {
        "main_1": {
            "args": ["--bolt-port", "7687", "--log-level=TRACE"],
            "log_file": "main1.log",
            "setup_queries": [],
            "data_directory": first_directory.name,
        },
        "main_2": {
            "args": ["--bolt-port", "7688", "--log-level=TRACE"],
            "log_file": "main2.log",
            "setup_queries": [],
            "data_directory": second_directory.name,
        },
    }
    interactive_mg_runner.start_all(CONFIGURATION)

    # Open both connections first, then query each instance for its role.
    cursors = [
        connection(7687, "main_1").cursor(),
        connection(7688, "main_2").cursor(),
    ]
    for instance_cursor in cursors:
        role = execute_and_fetch_all(instance_cursor, "SHOW REPLICATION ROLE;")[0][0]
        assert role == "main"
def test_basic_recovery_when_replica_is_kill_when_main_is_down(connection):
    """Check that main can restart when one of its replicas died while main was down.

    Steps:
      0/ Start all instances ourselves, so that the test can kill them without
         relying on external tooling to kill processes.
      1/ Check that all replicas have the correct state: they should all be ready.
      2/ Kill main, then kill a replica.
      3/ Restart main: it should be able to restart.
      4/ Check the status of the replicas: replica_2 is invalid.
    """
    # 0/
    # Keep the TemporaryDirectory object referenced for the whole test: main is
    # restarted with the same "data_directory" entry.
    data_directory = tempfile.TemporaryDirectory()
    CONFIGURATION = {
        "replica_1": {
            "args": ["--bolt-port", "7688", "--log-level=TRACE"],
            "log_file": "replica1.log",
            "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"],
        },
        "replica_2": {
            "args": ["--bolt-port", "7689", "--log-level=TRACE"],
            "log_file": "replica2.log",
            "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10002;"],
        },
        "main": {
            "args": ["--bolt-port", "7687", "--log-level=TRACE", "--storage-recover-on-startup=true"],
            "log_file": "main.log",
            "setup_queries": [],
            "data_directory": f"{data_directory.name}",
        },
    }
    # Derived from CONFIGURATION so that the role check below covers every replica.
    replica_names = [name for name in CONFIGURATION if name != "main"]

    interactive_mg_runner.start_all(CONFIGURATION)

    # We register the replicas manually and not via the configuration, otherwise
    # restarting main would also execute these registrations again.
    interactive_mg_runner.MEMGRAPH_INSTANCES["main"].query("REGISTER REPLICA replica_1 SYNC TO '127.0.0.1:10001';")
    interactive_mg_runner.MEMGRAPH_INSTANCES["main"].query("REGISTER REPLICA replica_2 SYNC TO '127.0.0.1:10002';")

    # 1/
    expected_data = {
        ("replica_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
        ("replica_2", "127.0.0.1:10002", "sync", 0, 0, "ready"),
    }
    actual_data = set(interactive_mg_runner.MEMGRAPH_INSTANCES["main"].query("SHOW REPLICAS;"))
    assert actual_data == expected_data

    def check_roles():
        # Must only be called while every instance is running: each one is queried.
        assert "main" == interactive_mg_runner.MEMGRAPH_INSTANCES["main"].query("SHOW REPLICATION ROLE;")[0][0]
        for name in replica_names:
            assert "replica" == interactive_mg_runner.MEMGRAPH_INSTANCES[name].query("SHOW REPLICATION ROLE;")[0][0]

    check_roles()

    # 2/
    interactive_mg_runner.kill(CONFIGURATION, "main")
    interactive_mg_runner.kill(CONFIGURATION, "replica_2")
    time.sleep(2)

    # 3/
    interactive_mg_runner.start(CONFIGURATION, "main")

    # 4/
    expected_data = {
        ("replica_1", "127.0.0.1:10001", "sync", 0, 0, "ready"),
        ("replica_2", "127.0.0.1:10002", "sync", 0, 0, "invalid"),
    }
    actual_data = set(interactive_mg_runner.MEMGRAPH_INSTANCES["main"].query("SHOW REPLICAS;"))
    assert actual_data == expected_data
if __name__ == "__main__":
    # Running the file directly hands it to pytest; "-rA" requests a short
    # summary line for every test outcome. pytest's result becomes the exit code.
    exit_code = pytest.main([__file__, "-rA"])
    sys.exit(exit_code)