Improve e2e and replication testing setup (#1061)

* Add `--replication-restore-state-on-startup` with `false` as default

Co-authored-by: Aidar Samerkhanov <aidar.samerkhanov@memgraph.io>
Co-authored-by: Andi Skrgat <andi8647@gmail.com>
This commit is contained in:
Marko Budiselić 2023-07-19 21:18:43 +02:00 committed by GitHub
parent 9d056e7649
commit 3b9133fd5a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 376 additions and 182 deletions

View File

@ -266,12 +266,11 @@ jobs:
- name: Run e2e tests - name: Run e2e tests
run: | run: |
# TODO(gitbuda): Setup mgclient and pymgclient properly.
cd tests cd tests
./setup.sh ./setup.sh
source ve3/bin/activate source ve3/bin/activate
cd e2e cd e2e
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:../../libs/mgclient/lib python runner.py --workloads-root-directory . ./run.sh
- name: Run stress test (plain) - name: Run stress test (plain)
run: | run: |
@ -293,7 +292,6 @@ jobs:
run: | run: |
# Activate toolchain. # Activate toolchain.
source /opt/toolchain-v4/activate source /opt/toolchain-v4/activate
cd build cd build
# create mgconsole # create mgconsole
@ -340,10 +338,8 @@ jobs:
run: | run: |
# Activate toolchain. # Activate toolchain.
source /opt/toolchain-v4/activate source /opt/toolchain-v4/activate
# Initialize dependencies. # Initialize dependencies.
./init ./init
# Build only memgraph release binary. # Build only memgraph release binary.
cd build cd build
cmake -DCMAKE_BUILD_TYPE=release .. cmake -DCMAKE_BUILD_TYPE=release ..
@ -352,7 +348,7 @@ jobs:
- name: Run Jepsen tests - name: Run Jepsen tests
run: | run: |
cd tests/jepsen cd tests/jepsen
./run.sh test --binary ../../build/memgraph --run-args "test-all --node-configs resources/node-config.edn" --ignore-run-stdout-logs --ignore-run-stderr-logs ./run.sh test-all-individually --binary ../../build/memgraph --ignore-run-stdout-logs --ignore-run-stderr-logs
- name: Save Jepsen report - name: Save Jepsen report
uses: actions/upload-artifact@v3 uses: actions/upload-artifact@v3

View File

@ -265,12 +265,11 @@ jobs:
- name: Run e2e tests - name: Run e2e tests
run: | run: |
# TODO(gitbuda): Setup mgclient and pymgclient properly.
cd tests cd tests
./setup.sh ./setup.sh
source ve3/bin/activate source ve3/bin/activate
cd e2e cd e2e
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:../../libs/mgclient/lib python runner.py --workloads-root-directory . ./run.sh
- name: Run stress test (plain) - name: Run stress test (plain)
run: | run: |

View File

@ -264,12 +264,11 @@ jobs:
- name: Run e2e tests - name: Run e2e tests
run: | run: |
# TODO(gitbuda): Setup mgclient and pymgclient properly.
cd tests cd tests
./setup.sh ./setup.sh
source ve3/bin/activate source ve3/bin/activate
cd e2e cd e2e
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:../../libs/mgclient/lib python runner.py --workloads-root-directory . ./run.sh
- name: Run stress test (plain) - name: Run stress test (plain)
run: | run: |
@ -319,10 +318,8 @@ jobs:
run: | run: |
# Activate toolchain. # Activate toolchain.
source /opt/toolchain-v4/activate source /opt/toolchain-v4/activate
# Initialize dependencies. # Initialize dependencies.
./init ./init
# Build only memgraph release binary. # Build only memgraph release binary.
cd build cd build
cmake -DCMAKE_BUILD_TYPE=release .. cmake -DCMAKE_BUILD_TYPE=release ..
@ -331,7 +328,7 @@ jobs:
- name: Run Jepsen tests - name: Run Jepsen tests
run: | run: |
cd tests/jepsen cd tests/jepsen
./run.sh test --binary ../../build/memgraph --run-args "test-all --node-configs resources/node-config.edn" --ignore-run-stdout-logs --ignore-run-stderr-logs ./run.sh test-all-individually --binary ../../build/memgraph --ignore-run-stdout-logs --ignore-run-stderr-logs
- name: Save Jepsen report - name: Save Jepsen report
uses: actions/upload-artifact@v3 uses: actions/upload-artifact@v3

View File

@ -264,12 +264,11 @@ jobs:
- name: Run e2e tests - name: Run e2e tests
run: | run: |
# TODO(gitbuda): Setup mgclient and pymgclient properly.
cd tests cd tests
./setup.sh ./setup.sh
source ve3/bin/activate source ve3/bin/activate
cd e2e cd e2e
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:../../libs/mgclient/lib python runner.py --workloads-root-directory . ./run.sh
- name: Run stress test (plain) - name: Run stress test (plain)
run: | run: |

View File

@ -261,6 +261,8 @@ DEFINE_double(query_execution_timeout_sec, 600,
DEFINE_uint64(replication_replica_check_frequency_sec, 1, DEFINE_uint64(replication_replica_check_frequency_sec, 1,
"The time duration between two replica checks/pings. If < 1, replicas will NOT be checked at all. NOTE: " "The time duration between two replica checks/pings. If < 1, replicas will NOT be checked at all. NOTE: "
"The MAIN instance allocates a new thread for each REPLICA."); "The MAIN instance allocates a new thread for each REPLICA.");
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_bool(replication_restore_state_on_startup, false, "Restore replication state on startup, e.g. recover replica");
// NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) // NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_uint64( DEFINE_uint64(
@ -891,7 +893,7 @@ int main(int argc, char **argv) {
.wal_file_size_kibibytes = FLAGS_storage_wal_file_size_kib, .wal_file_size_kibibytes = FLAGS_storage_wal_file_size_kib,
.wal_file_flush_every_n_tx = FLAGS_storage_wal_file_flush_every_n_tx, .wal_file_flush_every_n_tx = FLAGS_storage_wal_file_flush_every_n_tx,
.snapshot_on_exit = FLAGS_storage_snapshot_on_exit, .snapshot_on_exit = FLAGS_storage_snapshot_on_exit,
.restore_replication_state_on_startup = true, .restore_replication_state_on_startup = FLAGS_replication_restore_state_on_startup,
.items_per_batch = FLAGS_storage_items_per_batch, .items_per_batch = FLAGS_storage_items_per_batch,
.recovery_thread_count = FLAGS_storage_recovery_thread_count, .recovery_thread_count = FLAGS_storage_recovery_thread_count,
.allow_parallel_index_creation = FLAGS_storage_parallel_index_recovery}, .allow_parallel_index_creation = FLAGS_storage_parallel_index_recovery},

View File

@ -162,7 +162,7 @@ InMemoryStorage::InMemoryStorage(Config config)
} }
} else { } else {
spdlog::warn( spdlog::warn(
"Replicastion configuration will NOT be stored. When the server restarts, replication state will be " "Replication configuration will NOT be stored. When the server restarts, replication state will be "
"forgotten."); "forgotten.");
} }

View File

@ -350,7 +350,7 @@ uint64_t InMemoryStorage::ReplicationClient::ReplicateCurrentWal() {
/// transactions while Snapshots contain all the data. For that reason we prefer /// transactions while Snapshots contain all the data. For that reason we prefer
/// WALs as much as possible. As the WAL file that is currently being updated /// WALs as much as possible. As the WAL file that is currently being updated
/// can change during the process we ignore it as much as possible. Also, it /// can change during the process we ignore it as much as possible. Also, it
/// uses the transaction lock so lokcing it can be really expensive. After we /// uses the transaction lock so locking it can be really expensive. After we
/// fetch the list of finalized WALs, we try to find the longest chain of /// fetch the list of finalized WALs, we try to find the longest chain of
/// sequential WALs, starting from the latest one, that will update the recovery /// sequential WALs, starting from the latest one, that will update the recovery
/// with the all missed updates. If the WAL chain cannot be created, replica is /// with the all missed updates. If the WAL chain cannot be created, replica is

13
tests/e2e/README.md Normal file
View File

@ -0,0 +1,13 @@
# tests/e2e
Framework to run end-to-end tests against Memgraph.
## Notes
* If you change something under this directory and below (even a Python
script), `make` has to be run again because all tests are copied to the build
directory and executed from there.
* Use/extend `run.sh` if you run any e2e tests:
* if all tests have to be executed, use `run.sh`
* if a suite of tests has to be executed, take a look at `run.sh` to see how to do so
* if only a single test has to be executed, take a look at each individual binary/script; it's possible to manually pick the test

View File

@ -187,4 +187,9 @@ startup_config_dict = {
"Path to cypherl file that is used for configuring users and database schema before server starts.", "Path to cypherl file that is used for configuring users and database schema before server starts.",
), ),
"init_data_file": ("", "", "Path to cypherl file that is used for creating data after server starts."), "init_data_file": ("", "", "Path to cypherl file that is used for creating data after server starts."),
"replication_restore_state_on_startup": (
"false",
"false",
"Restore replication state on startup, e.g. recover replica",
),
} }

View File

@ -33,13 +33,11 @@
import atexit import atexit
import logging import logging
import os import os
import subprocess
import sys import sys
import tempfile import tempfile
import time import time
from argparse import ArgumentParser from argparse import ArgumentParser
from inspect import signature from inspect import signature
from pathlib import Path
import yaml import yaml
@ -77,9 +75,9 @@ ACTIONS = {
"info": lambda context: info(context), "info": lambda context: info(context),
"stop": lambda context, name: stop(context, name), "stop": lambda context, name: stop(context, name),
"start": lambda context, name: start(context, name), "start": lambda context, name: start(context, name),
"sleep": lambda context, delta: time.sleep(float(delta)), "sleep": lambda _, delta: time.sleep(float(delta)),
"exit": lambda context: sys.exit(1), "exit": lambda _: sys.exit(1),
"quit": lambda context: sys.exit(1), "quit": lambda _: sys.exit(1),
} }
log = logging.getLogger("memgraph.tests.e2e") log = logging.getLogger("memgraph.tests.e2e")

View File

@ -13,7 +13,6 @@ import copy
import os import os
import subprocess import subprocess
import sys import sys
import tempfile
import time import time
import mgclient import mgclient

View File

@ -147,27 +147,33 @@ def test_basic_recovery(connection):
data_directory = tempfile.TemporaryDirectory() data_directory = tempfile.TemporaryDirectory()
CONFIGURATION = { CONFIGURATION = {
"replica_1": { "replica_1": {
"args": ["--bolt-port", "7688", "--log-level=TRACE"], "args": ["--bolt-port", "7688", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica1.log", "log_file": "replica1.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"], "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"],
}, },
"replica_2": { "replica_2": {
"args": ["--bolt-port", "7689", "--log-level=TRACE"], "args": ["--bolt-port", "7689", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica2.log", "log_file": "replica2.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10002;"], "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10002;"],
}, },
"replica_3": { "replica_3": {
"args": ["--bolt-port", "7690", "--log-level=TRACE"], "args": ["--bolt-port", "7690", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica3.log", "log_file": "replica3.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10003;"], "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10003;"],
}, },
"replica_4": { "replica_4": {
"args": ["--bolt-port", "7691", "--log-level=TRACE"], "args": ["--bolt-port", "7691", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica4.log", "log_file": "replica4.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10004;"], "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10004;"],
}, },
"main": { "main": {
"args": ["--bolt-port", "7687", "--log-level=TRACE", "--storage-recover-on-startup=true"], "args": [
"--bolt-port",
"7687",
"--log-level=TRACE",
"--storage-recover-on-startup=true",
"--replication-restore-state-on-startup=true",
],
"log_file": "main.log", "log_file": "main.log",
"setup_queries": [], "setup_queries": [],
"data_directory": f"{data_directory.name}", "data_directory": f"{data_directory.name}",
@ -359,13 +365,19 @@ def test_replication_role_recovery(connection):
data_directory = tempfile.TemporaryDirectory() data_directory = tempfile.TemporaryDirectory()
CONFIGURATION = { CONFIGURATION = {
"replica": { "replica": {
"args": ["--bolt-port", "7688", "--log-level=TRACE"], "args": ["--bolt-port", "7688", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica.log", "log_file": "replica.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"], "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"],
"data_directory": f"{data_directory.name}/replica", "data_directory": f"{data_directory.name}/replica",
}, },
"main": { "main": {
"args": ["--bolt-port", "7687", "--log-level=TRACE", "--storage-recover-on-startup=true"], "args": [
"--bolt-port",
"7687",
"--log-level=TRACE",
"--storage-recover-on-startup=true",
"--replication-restore-state-on-startup=true",
],
"log_file": "main.log", "log_file": "main.log",
"setup_queries": [], "setup_queries": [],
"data_directory": f"{data_directory.name}/main", "data_directory": f"{data_directory.name}/main",
@ -381,13 +393,19 @@ def test_replication_role_recovery(connection):
# When we restart the replica, it does not need this query anymore since it needs to remember state # When we restart the replica, it does not need this query anymore since it needs to remember state
CONFIGURATION = { CONFIGURATION = {
"replica": { "replica": {
"args": ["--bolt-port", "7688", "--log-level=TRACE"], "args": ["--bolt-port", "7688", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica.log", "log_file": "replica.log",
"setup_queries": [], "setup_queries": [],
"data_directory": f"{data_directory.name}/replica", "data_directory": f"{data_directory.name}/replica",
}, },
"main": { "main": {
"args": ["--bolt-port", "7687", "--log-level=TRACE", "--storage-recover-on-startup=true"], "args": [
"--bolt-port",
"7687",
"--log-level=TRACE",
"--storage-recover-on-startup=true",
"--replication-restore-state-on-startup=true",
],
"log_file": "main.log", "log_file": "main.log",
"setup_queries": [], "setup_queries": [],
"data_directory": f"{data_directory.name}/main", "data_directory": f"{data_directory.name}/main",
@ -511,17 +529,23 @@ def test_basic_recovery_when_replica_is_kill_when_main_is_down():
data_directory = tempfile.TemporaryDirectory() data_directory = tempfile.TemporaryDirectory()
CONFIGURATION = { CONFIGURATION = {
"replica_1": { "replica_1": {
"args": ["--bolt-port", "7688", "--log-level=TRACE"], "args": ["--bolt-port", "7688", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica1.log", "log_file": "replica1.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"], "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"],
}, },
"replica_2": { "replica_2": {
"args": ["--bolt-port", "7689", "--log-level=TRACE"], "args": ["--bolt-port", "7689", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica2.log", "log_file": "replica2.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10002;"], "setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10002;"],
}, },
"main": { "main": {
"args": ["--bolt-port", "7687", "--log-level=TRACE", "--storage-recover-on-startup=true"], "args": [
"--bolt-port",
"7687",
"--log-level=TRACE",
"--storage-recover-on-startup=true",
"--replication-restore-state-on-startup=true",
],
"log_file": "main.log", "log_file": "main.log",
"setup_queries": [], "setup_queries": [],
"data_directory": f"{data_directory.name}", "data_directory": f"{data_directory.name}",

37
tests/e2e/run.sh Executable file
View File

@ -0,0 +1,37 @@
#!/bin/bash
# Runs the e2e tests through runner.py.
#
# Usage:
#   ./run.sh                    run all workloads found below the current directory
#   ./run.sh "workload name"    run only the workload with the given name
#   ./run.sh -h | --help        print the usage text
#
# NOTE: runner.py and the mgclient library path below are both resolved
# relative to the current working directory.

# TODO(gitbuda): Setup mgclient and pymgclient properly.
# Only prepend the existing value (plus a separator) when it is non-empty; a
# leading ':' would add an empty entry, which the dynamic loader treats as the
# current directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}../../libs/mgclient/lib"

# Prints the usage text and exits.
#   $1 - exit status to use (optional, defaults to 1 so that wrong usage fails).
print_help() {
  # The inner quotes are escaped so that they are actually printed.
  echo -e "$0 [\"workload name string\"]"
  echo -e ""
  echo -e "  NOTE: some tests require enterprise licence key,"
  echo -e "        to run those define the following env vars:"
  echo -e "          * MEMGRAPH_ORGANIZATION_NAME"
  echo -e "          * MEMGRAPH_ENTERPRISE_LICENSE"
  exit "${1:-1}"
}

# Prints a note when either of the enterprise license env vars is not set.
# The tests are started regardless; this is informational only.
check_license() {
  if [ ! -v MEMGRAPH_ORGANIZATION_NAME ] || [ ! -v MEMGRAPH_ENTERPRISE_LICENSE ]; then
    echo "NOTE: MEMGRAPH_ORGANIZATION_NAME or MEMGRAPH_ENTERPRISE_LICENSE NOT defined -> dependent tests will NOT work"
  fi
}

if [ "$#" -eq 0 ]; then
  check_license
  # NOTE: If you want to run all tests under specific folder/section just
  # replace the dot (root directory below) with the folder name, e.g.
  # `--workloads-root-directory replication`.
  python3 runner.py --workloads-root-directory .
elif [ "$#" -eq 1 ]; then
  if [ "$1" == "-h" ] || [ "$1" == "--help" ]; then
    # Help was explicitly requested, so this is not an error.
    print_help 0
  fi
  check_license
  # NOTE: --workload-name comes from each individual folder/section
  # workloads.yaml file. E.g. `streams/workloads.yaml` has a list of
  # `workloads:` and each workload has its `-name`.
  python3 runner.py --workloads-root-directory . --workload-name "$1"
else
  # More than one argument is not supported.
  print_help
fi

View File

@ -1,4 +0,0 @@
#!/bin/bash
# TODO: andi as a side project
python3 runner.py --workloads-root-directory disk_storage

View File

@ -5,14 +5,6 @@ test_transaction_queue: &test_transaction_queue
log_file: "transaction_queue.log" log_file: "transaction_queue.log"
setup_queries: [] setup_queries: []
validation_queries: [] validation_queries: []
disk_test_transaction_queue: &disk_test_transaction_queue
cluster:
main:
args: ["--bolt-port", "7687", "--log-level=TRACE", "--also-log-to-stderr"]
log_file: "transaction_queue.log"
setup_queries: ["STORAGE MODE ON_DISK_TRANSACTIONAL"]
validation_queries: []
workloads: workloads:
- name: "test-transaction-queue" # should be the same as the python file - name: "test-transaction-queue" # should be the same as the python file
@ -20,8 +12,3 @@ workloads:
proc: "tests/e2e/transaction_queue/procedures/" proc: "tests/e2e/transaction_queue/procedures/"
args: ["transaction_queue/test_transaction_queue.py"] args: ["transaction_queue/test_transaction_queue.py"]
<<: *test_transaction_queue <<: *test_transaction_queue
- name: "test-transaction-queue on disk" # should be the same as the python file
binary: "tests/e2e/pytest_runner.sh"
proc: "tests/e2e/transaction_queue/procedures/"
args: ["transaction_queue/test_transaction_queue.py"]
<<: *disk_test_transaction_queue

View File

@ -2,3 +2,31 @@
NOTE: Jepsen can only connect to the SSH server on the default 22 port. NOTE: Jepsen can only connect to the SSH server on the default 22 port.
`--node` flag only takes the actual address (:port doesn't work). `--node` flag only takes the actual address (:port doesn't work).
Jepsen run under CI:
```
cd tests/jepsen
./run.sh test --binary ../../build/memgraph --run-args "test-all --node-configs resources/node-config.edn" --ignore-run-stdout-logs --ignore-run-stderr-logs
```
Local run of each test (including setup):
```
cd tests/jepsen
./run.sh cluster-up
docker exec -it jepsen-control bash
cd memgraph
lein run test --workload bank --node-configs resources/node-config.edn
lein run test --workload large --node-configs resources/node-config.edn
```
Logs are located under `jepsen-control:/jepsen/memgraph/store`.
If you set up the cluster manually, go to the jepsen-control Docker container and ssh to all cluster nodes to save their host keys in known_hosts.
```
docker exec -it jepsen-control bash
ssh n1 -> yes -> exit
ssh n2 -> yes -> exit
ssh n3 -> yes -> exit
ssh n4 -> yes -> exit
ssh n5 -> yes -> exit
```

View File

@ -0,0 +1,13 @@
diff --git a/docker/control/Dockerfile b/docker/control/Dockerfile
index 6b2d3c0e..195a7a60 100644
--- a/docker/control/Dockerfile
+++ b/docker/control/Dockerfile
@@ -7,7 +7,7 @@ ENV LEIN_ROOT true
# Jepsen dependencies
#
RUN apt-get -y -q update && \
- apt-get install -qy openjdk-17-jdk-headless \
+ apt-get install -qy ca-certificates-java openjdk-17-jdk-headless \
libjna-java \
vim \
emacs \

View File

@ -5,7 +5,10 @@
:url "https://github.com/memgraph/memgraph/blob/master/release/LICENSE_ENTERPRISE.md"} :url "https://github.com/memgraph/memgraph/blob/master/release/LICENSE_ENTERPRISE.md"}
:main jepsen.memgraph.core :main jepsen.memgraph.core
:dependencies [[org.clojure/clojure "1.10.0"] :dependencies [[org.clojure/clojure "1.10.0"]
[jepsen "0.2.1-SNAPSHOT"] ;; 0.2.4-SNAPSHOT but 0.3.0, for more -> https://clojars.org/jepsen/versions
[jepsen "0.2.4-SNAPSHOT"]
[gorillalabs/neo4j-clj "4.1.0"]] [gorillalabs/neo4j-clj "4.1.0"]]
:profiles {:test {:dependencies [#_[org.neo4j.test/neo4j-harness "4.1.0"]]}} :profiles {:test {:dependencies [#_[org.neo4j.test/neo4j-harness "4.1.0"]]}}
;; required to run 0.3.0
; :aot :all
:repl-options {:init-ns jepsen.memgraph.core}) :repl-options {:init-ns jepsen.memgraph.core})

View File

@ -1,16 +1,18 @@
#!/bin/bash #!/bin/bash
set -Eeuo pipefail set -Eeuo pipefail
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
MEMGRAPH_BINARY_PATH="../../build/memgraph" MEMGRAPH_BINARY_PATH="../../build/memgraph"
# NOTE: On Ubuntu 22.04 0.3.2 uses non-existing docker compose --compatibility flag. # NOTE: Jepsen Git tags are not consistent, there are: 0.2.4, v0.3.0, 0.3.2, ...
# NOTE: On Ubuntu 22.04 0.3.1 seems to be working. # NOTE: On Ubuntu 22.04 v0.3.2 uses non-existing docker compose --compatibility flag.
JEPSEN_VERSION="${JEPSEN_VERSION:-v0.3.0}" # NOTE: On Ubuntu 22.04 v0.3.0 and v0.3.1 seems to be runnable.
# TODO(gitbuda): Make sure Memgraph can be tested with Jepsen >= 0.3.0
JEPSEN_VERSION="${JEPSEN_VERSION:-0.2.4}"
JEPSEN_ACTIVE_NODES_NO=5 JEPSEN_ACTIVE_NODES_NO=5
CONTROL_LEIN_RUN_ARGS="test-all --node-configs resources/node-config.edn" CONTROL_LEIN_RUN_ARGS="test-all --node-configs resources/node-config.edn"
CONTROL_LEIN_RUN_STDOUT_LOGS=1 CONTROL_LEIN_RUN_STDOUT_LOGS=1
CONTROL_LEIN_RUN_STDERR_LOGS=1 CONTROL_LEIN_RUN_STDERR_LOGS=1
_JEPSEN_RUN_EXIT_STATUS=0
PRINT_CONTEXT() { PRINT_CONTEXT() {
echo -e "MEMGRAPH_BINARY_PATH:\t\t $MEMGRAPH_BINARY_PATH" echo -e "MEMGRAPH_BINARY_PATH:\t\t $MEMGRAPH_BINARY_PATH"
echo -e "JEPSEN_VERSION:\t\t\t $JEPSEN_VERSION" echo -e "JEPSEN_VERSION:\t\t\t $JEPSEN_VERSION"
@ -22,7 +24,7 @@ PRINT_CONTEXT() {
HELP_EXIT() { HELP_EXIT() {
echo "" echo ""
echo "HELP: $0 help|cluster-up|test [args]" echo "HELP: $0 help|cluster-up|cluster-cleanup|cluster-dealloc|mgbuild|test|test-all-individually [args]"
echo "" echo ""
echo " test args --binary MEMGRAPH_BINARY_PATH" echo " test args --binary MEMGRAPH_BINARY_PATH"
echo " --ignore-run-stdout-logs Ignore lein run stdout logs." echo " --ignore-run-stdout-logs Ignore lein run stdout logs."
@ -45,153 +47,235 @@ if ! command -v docker > /dev/null 2>&1 || ! command -v docker-compose > /dev/nu
ERROR "docker and docker-compose have to be installed." ERROR "docker and docker-compose have to be installed."
exit 1 exit 1
fi fi
PRINT_CONTEXT
if [ ! -d "$script_dir/jepsen" ]; then if [ ! -d "$script_dir/jepsen" ]; then
git clone https://github.com/jepsen-io/jepsen.git -b "$JEPSEN_VERSION" "$script_dir/jepsen" git clone https://github.com/jepsen-io/jepsen.git -b "$JEPSEN_VERSION" "$script_dir/jepsen"
if [ "$JEPSEN_VERSION" == "v0.3.0" ]; then
if [ -f "$script_dir/jepsen_0.3.0.patch" ]; then
cd "$script_dir/jepsen"
git apply "$script_dir/jepsen_0.3.0.patch"
cd "$script_dir"
fi
fi
fi fi
if [ "$#" -lt 1 ]; then if [ "$#" -lt 1 ]; then
HELP_EXIT HELP_EXIT
fi fi
# Parses the options that follow the subcommand and stores them in the
# script-level configuration variables.
#   $@ - the full script argument list; the first element (the subcommand
#        name) is skipped.
# Recognized options:
#   --binary PATH               -> MEMGRAPH_BINARY_PATH
#   --ignore-run-stdout-logs    -> CONTROL_LEIN_RUN_STDOUT_LOGS=0
#   --ignore-run-stderr-logs    -> CONTROL_LEIN_RUN_STDERR_LOGS=0
#   --nodes-no N                -> JEPSEN_ACTIVE_NODES_NO
#   --run-args "ARGS"           -> CONTROL_LEIN_RUN_ARGS
# Any other option is reported via ERROR followed by HELP_EXIT.
PROCESS_ARGS() {
    # Drop the subcommand name; only its options remain.
    shift
    while [[ $# -gt 0 ]]; do
        key="$1"
        # Options that take a value must be followed by one. Without this
        # check a trailing `--binary` would die on an unbound "$1" (set -u)
        # with no hint about what went wrong.
        case $key in
            --binary|--nodes-no|--run-args)
                if [[ $# -lt 2 ]]; then
                    ERROR "Option $key requires a value."
                    # NOTE(review): HELP_EXIT is defined outside this block and
                    # is assumed to terminate the script -- confirm.
                    HELP_EXIT
                fi
            ;;
        esac
        case $key in
            --binary)
                MEMGRAPH_BINARY_PATH="$2"
                shift 2
            ;;
            --ignore-run-stdout-logs)
                CONTROL_LEIN_RUN_STDOUT_LOGS=0
                shift
            ;;
            --ignore-run-stderr-logs)
                CONTROL_LEIN_RUN_STDERR_LOGS=0
                shift
            ;;
            --nodes-no)
                JEPSEN_ACTIVE_NODES_NO="$2"
                shift 2
            ;;
            --run-args)
                CONTROL_LEIN_RUN_ARGS="$2"
                shift 2
            ;;
            *)
                ERROR "Unknown option $1."
                HELP_EXIT
            ;;
        esac
    done
}
# Copies the Memgraph binary into every active Jepsen node container and the
# test sources into the jepsen-control container.
# Reads: MEMGRAPH_BINARY_PATH, JEPSEN_ACTIVE_NODES_NO, script_dir.
COPY_BINARIES() {
    # Copy Memgraph binary, handles both cases, when binary is a sym link
    # or a regular file.
    binary_path="$MEMGRAPH_BINARY_PATH"
    if [ -L "$binary_path" ]; then
        # -f canonicalizes the path: a plain `readlink` returns the raw link
        # target, which is wrong for a relative target when the current
        # directory is not the link's directory, and for chains of links.
        binary_path=$(readlink -f "$binary_path")
    fi
    binary_name=$(basename -- "$binary_path")
    for iter in $(seq 1 "$JEPSEN_ACTIVE_NODES_NO"); do
        jepsen_node_name="jepsen-n$iter"
        # Intentionally expanded unquoted below so it splits into a command.
        docker_exec="docker exec $jepsen_node_name bash -c"
        # /opt/memgraph/memgraph is created as a symlink below, so a binary
        # that is itself called "memgraph" is stored under a different name.
        if [ "$binary_name" == "memgraph" ]; then
            _binary_name="memgraph_tmp"
        else
            _binary_name="$binary_name"
        fi
        $docker_exec "rm -rf /opt/memgraph/ && mkdir -p /opt/memgraph"
        docker cp "$binary_path" "$jepsen_node_name":/opt/memgraph/"$_binary_name"
        $docker_exec "ln -s /opt/memgraph/$_binary_name /opt/memgraph/memgraph"
        $docker_exec "touch /opt/memgraph/memgraph.log"
        INFO "Copying $binary_name to $jepsen_node_name DONE."
    done
    # Copy test files into the control node.
    docker exec jepsen-control mkdir -p /jepsen/memgraph/store
    docker cp "$script_dir/src/." jepsen-control:/jepsen/memgraph/src/
    docker cp "$script_dir/test/." jepsen-control:/jepsen/memgraph/test/
    docker cp "$script_dir/resources/." jepsen-control:/jepsen/memgraph/resources/
    docker cp "$script_dir/project.clj" jepsen-control:/jepsen/memgraph/project.clj
    INFO "Copying test files to jepsen-control DONE."
}
# Executes `lein run <args>` inside the jepsen-control container and records
# the exit status of that command in _JEPSEN_RUN_EXIT_STATUS.
#   $1 - arguments passed to `lein run`.
# Reads CONTROL_LEIN_RUN_STDOUT_LOGS / CONTROL_LEIN_RUN_STDERR_LOGS: a value
# of 0 sends the corresponding stream to /dev/null.
RUN_JEPSEN() {
    __control_lein_run_args="$1"
    # Errors are tolerated until `set -e` below so that the run result can
    # still be archived when the run fails.
    set +e
    # Start from pass-through and silence a stream only when asked to.
    redirect_stderr_logs="/dev/stderr"
    if [ "$CONTROL_LEIN_RUN_STDERR_LOGS" -eq 0 ]; then
        redirect_stderr_logs="/dev/null"
    fi
    redirect_stdout_logs="/dev/stdout"
    if [ "$CONTROL_LEIN_RUN_STDOUT_LOGS" -eq 0 ]; then
        redirect_stdout_logs="/dev/null"
    fi
    # NOTE: `docker exec -t` must not be used, the gh CI user has no TTY.
    # NOTE: with `bash -c`, ~/.bashrc is sourced by hand because some Jepsen
    # config lives there.
    docker exec jepsen-control bash -c "source ~/.bashrc && cd memgraph && lein run $__control_lein_run_args" 1> $redirect_stdout_logs 2> $redirect_stderr_logs
    _JEPSEN_RUN_EXIT_STATUS=$?
    set -e
}
# Prints the tail of jepsen.log for, and packs into Jepsen.tar.gz, every
# workload run stored on jepsen-control whose time folder lies between the
# given timestamps. The archive is copied into the current directory.
#   $1 - start time, compared as a string against the time folder names.
#   $2 - end time, same format.
# Returns 0 without creating an archive when no matching run is found, so the
# caller can still report the real Jepsen exit status.
PROCESS_RESULTS() {
    start_time="$1"
    end_time="$2"
    INFO "Process results..."
    # Print and pack all test workload runs between start and end time.
    # `|| true`: grep exits with 1 when nothing matches, which under
    # `set -e -o pipefail` would abort the script right here.
    all_workloads=$(docker exec jepsen-control bash -c 'ls /jepsen/memgraph/store/' | grep test- || true)
    all_workload_run_folders=""
    for workload in $all_workloads; do
        for time_folder in $(docker exec jepsen-control bash -c "ls /jepsen/memgraph/store/$workload"); do
            if [[ "$time_folder" == "latest" ]]; then
                continue
            fi
            # The early continue pattern here is nice because bash doesn't
            # have >= for the string comparison (marginal values).
            if [[ "$time_folder" < "$start_time" ]]; then
                continue
            fi
            if [[ "$time_folder" > "$end_time" ]]; then
                continue
            fi
            INFO "jepsen.log for $workload/$time_folder"
            docker exec jepsen-control bash -c "tail -n 50 /jepsen/memgraph/store/$workload/$time_folder/jepsen.log"
            all_workload_run_folders="$all_workload_run_folders /jepsen/memgraph/store/$workload/$time_folder"
        done
    done
    # tar fails when given no input, so there is nothing to pack in this case.
    if [ -z "$all_workload_run_folders" ]; then
        INFO "No workload runs found between $start_time and $end_time, nothing to pack."
        return 0
    fi
    INFO "Packing results..."
    docker exec jepsen-control bash -c "tar -czvf /jepsen/memgraph/Jepsen.tar.gz $all_workload_run_folders"
    docker cp jepsen-control:/jepsen/memgraph/Jepsen.tar.gz ./
    INFO "Result processing (printing and packing) DONE."
}
# Initialize testing context by copying source/binary files. Inside CI, # Initialize testing context by copying source/binary files. Inside CI,
# Memgraph is tested on a single machine cluster based on Docker containers. # Memgraph is tested on a single machine cluster based on Docker containers.
# Once these tests will be part of the official Jepsen repo, the majority of # Once these tests will be part of the official Jepsen repo, the majority of
# functionalities inside this script won't be needed because each node clones # functionalities inside this script won't be needed because each node clones
# the public repo. # the public repo.
case $1 in case $1 in
help)
HELP_EXIT
;;
# Start Jepsen Docker cluster of 5 nodes. To configure the cluster please # Start Jepsen Docker cluster of 5 nodes. To configure the cluster please
# take a look under jepsen/docker/docker-compose.yml. # take a look under jepsen/docker/docker-compose.yml.
# NOTE: If you delete the jepsen folder where docker config is located, # NOTE: If you delete the jepsen folder where docker config is located,
# the current cluster is broken because it relies on the folder. That can # the current cluster is broken because it relies on the folder. That can
# happen easily because the jepsen folder is git ignored. # happen easily because the jepsen folder is git ignored.
cluster-up) cluster-up)
PRINT_CONTEXT
"$script_dir/jepsen/docker/bin/up" --daemon "$script_dir/jepsen/docker/bin/up" --daemon
;; ;;
# Run tests against the specified Memgraph binary.
test)
shift
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
--binary)
shift
MEMGRAPH_BINARY_PATH="$1"
shift
;;
--ignore-run-stdout-logs)
CONTROL_LEIN_RUN_STDOUT_LOGS=0
shift
;;
--ignore-run-stderr-logs)
CONTROL_LEIN_RUN_STDERR_LOGS=0
shift
;;
--nodes-no)
shift
JEPSEN_ACTIVE_NODES_NO="$1"
shift
;;
--run-args)
shift
CONTROL_LEIN_RUN_ARGS="$1"
shift
;;
*)
ERROR "Unknown option $1."
HELP_EXIT
;;
esac
done
# Copy Memgraph binary, handles both cases, when binary is a sym link cluster-cleanup)
# or a regular file. jepsen_control_exec="docker exec jepsen-control bash -c"
binary_path="$MEMGRAPH_BINARY_PATH" INFO "Deleting /jepsen/memgraph/store/* on jepsen-control"
if [ -L "$binary_path" ]; then $jepsen_control_exec "rm -rf /jepsen/memgraph/store/*"
binary_path=$(readlink "$binary_path")
fi
binary_name=$(basename -- "$binary_path")
for iter in $(seq 1 "$JEPSEN_ACTIVE_NODES_NO"); do for iter in $(seq 1 "$JEPSEN_ACTIVE_NODES_NO"); do
jepsen_node_name="jepsen-n$iter" jepsen_node_name="jepsen-n$iter"
docker_exec="docker exec $jepsen_node_name bash -c" jepsen_node_exec="docker exec $jepsen_node_name bash -c"
if [ "$binary_name" == "memgraph" ]; then INFO "Deleting /opt/memgraph/* on $jepsen_node_name"
_binary_name="memgraph_tmp" $jepsen_node_exec "rm -rf /opt/memgraph/*"
else
_binary_name="$binary_name"
fi
$docker_exec "rm -rf /opt/memgraph/ && mkdir -p /opt/memgraph"
docker cp "$binary_path" "$jepsen_node_name":/opt/memgraph/"$_binary_name"
$docker_exec "ln -s /opt/memgraph/$_binary_name /opt/memgraph/memgraph"
$docker_exec "touch /opt/memgraph/memgraph.log"
INFO "Copying $binary_name to $jepsen_node_name DONE."
done done
;;
# Copy test files into the control node. cluster-dealloc)
docker exec jepsen-control mkdir -p /jepsen/memgraph ps=$(docker ps --filter name=jepsen* --filter status=running -q)
docker cp "$script_dir/src/." jepsen-control:/jepsen/memgraph/src/ if [[ ! -z ${ps} ]]; then
docker cp "$script_dir/test/." jepsen-control:/jepsen/memgraph/test/ echo "Killing ${ps}"
docker cp "$script_dir/resources/." jepsen-control:/jepsen/memgraph/resources/ docker rm -f ${ps}
docker cp "$script_dir/project.clj" jepsen-control:/jepsen/memgraph/project.clj imgs=$(docker images "jepsen*" -q)
INFO "Copying test files to jepsen-control DONE." if [[ ! -z ${imgs} ]]; then
echo "Removing ${imgs}"
docker images "jepsen*" -q | xargs docker image rmi -f
else
echo "No Jepsen images detected!"
fi
else
echo "No Jepsen containers detected!"
fi
;;
mgbuild)
PRINT_CONTEXT
echo ""
echo "TODO(gitbuda): Build memgraph for Debian 10 via memgraph/memgraph-builder"
exit 1
;;
test)
PROCESS_ARGS "$@"
PRINT_CONTEXT
COPY_BINARIES
start_time="$(docker exec jepsen-control bash -c 'date -u +"%Y%m%dT%H%M%S"').000Z" start_time="$(docker exec jepsen-control bash -c 'date -u +"%Y%m%dT%H%M%S"').000Z"
# Run the test.
# NOTE: docker exec -t is NOT ok because gh CI user does NOT have TTY.
# NOTE: ~/.bashrc has to be manually sourced when bash -c is used
# because some Jepsen config is there.
set +e
if [ "$CONTROL_LEIN_RUN_STDOUT_LOGS" -eq 0 ]; then
redirect_stdout_logs="/dev/null"
else
redirect_stdout_logs="/dev/stdout"
fi
if [ "$CONTROL_LEIN_RUN_STDERR_LOGS" -eq 0 ]; then
redirect_stderr_logs="/dev/null"
else
redirect_stderr_logs="/dev/stderr"
fi
INFO "Jepsen run in progress... START_TIME: $start_time" INFO "Jepsen run in progress... START_TIME: $start_time"
docker exec jepsen-control bash -c "source ~/.bashrc && cd memgraph && lein run $CONTROL_LEIN_RUN_ARGS" 1> $redirect_stdout_logs 2> $redirect_stderr_logs RUN_JEPSEN "$CONTROL_LEIN_RUN_ARGS"
# To be able to archive the run result even if the run fails.
jepsen_run_exit_status=$?
end_time="$(docker exec jepsen-control bash -c 'date -u +"%Y%m%dT%H%M%S"').000Z" end_time="$(docker exec jepsen-control bash -c 'date -u +"%Y%m%dT%H%M%S"').000Z"
INFO "Jepsen run DONE. END_TIME: $end_time" INFO "Jepsen run DONE. END_TIME: $end_time"
set -e PROCESS_RESULTS "$start_time" "$end_time"
# Exit if the jepsen run status is not 0
# Pack all test workload runs between start and end time. if [ "$_JEPSEN_RUN_EXIT_STATUS" -ne 0 ]; then
all_workloads=$(docker exec jepsen-control bash -c 'ls /jepsen/memgraph/store/' | grep test-) ERROR "Jepsen FAILED" # important for the coder
all_workload_run_folders="" exit "$_JEPSEN_RUN_EXIT_STATUS" # important for CI
for workload in $all_workloads; do
for time_folder in $(docker exec jepsen-control bash -c "ls /jepsen/memgraph/store/$workload"); do
if [[ "$time_folder" == "latest" ]]; then
continue
fi
# The early continue pattern here is nice because bash doesn't
# have >= for the string comparison (marginal values).
if [[ "$time_folder" < "$start_time" ]]; then
continue
fi
if [[ "$time_folder" > "$end_time" ]]; then
continue
fi
all_workload_run_folders="$all_workload_run_folders /jepsen/memgraph/store/$workload/$time_folder"
done
done
docker exec jepsen-control bash -c "tar -czvf /jepsen/memgraph/Jepsen.tar.gz $all_workload_run_folders"
docker cp jepsen-control:/jepsen/memgraph/Jepsen.tar.gz ./
INFO "Test and results packing DONE."
# If the run has failed, this script also has to return non-zero status.
if [ "$jepsen_run_exit_status" -ne 0 ]; then
exit "$jepsen_run_exit_status"
fi fi
;; ;;
test-all-individually)
PROCESS_ARGS "$@"
PRINT_CONTEXT
INFO "NOTE: CONTROL_LEIN_RUN_ARGS ignored"
COPY_BINARIES
start_time="$(docker exec jepsen-control bash -c 'date -u +"%Y%m%dT%H%M%S"').000Z"
INFO "Jepsen run in progress... START_TIME: $start_time"
for workload in "bank" "large"; do
RUN_JEPSEN "test --workload $workload --node-configs resources/node-config.edn"
if [ "$_JEPSEN_RUN_EXIT_STATUS" -ne 0 ]; then
break
fi
done
end_time="$(docker exec jepsen-control bash -c 'date -u +"%Y%m%dT%H%M%S"').000Z"
INFO "Jepsen run DONE. END_TIME: $end_time"
PROCESS_RESULTS "$start_time" "$end_time"
# Exit if the jepsen run status is not 0
if [ "$_JEPSEN_RUN_EXIT_STATUS" -ne 0 ]; then
ERROR "Jepsen FAILED" # important for the coder
exit "$_JEPSEN_RUN_EXIT_STATUS" # important for CI
fi
;;
*) *)
HELP_EXIT HELP_EXIT
;; ;;
esac esac

View File

@ -146,9 +146,23 @@
["-w" "--workload NAME" "Test workload to run" ["-w" "--workload NAME" "Test workload to run"
:parse-fn keyword :parse-fn keyword
:validate [workloads (cli/one-of workloads)]] :validate [workloads (cli/one-of workloads)]]
[nil "--node-configs PATH" "Path to the node configuration file." [nil "--node-configs PATH" "Path to a file containing a list of node config."
:parse-fn #(-> % e/load-configuration)]]) :parse-fn #(-> % e/load-configuration)]])
(defn single-test
  "Builds one Jepsen test from the base CLI options.

  Both `:workload` and `:node-configs` must be present (truthy) in `opts`;
  otherwise an `Exception` is thrown. Only the first entry of
  `:node-configs` is used: it is merged with `(:nodes opts)` and the first
  merged result is stored under `:node-config` before the options are
  handed to `memgraph-test`."
  [opts]
  (let [workload    (or (:workload opts)
                        (throw (Exception. "Workload undefined")))
        node-config (if-let [configs (:node-configs opts)]
                      (->> (list (first configs))
                           (merge-node-configurations (:nodes opts))
                           first)
                      (throw (Exception. "Node configs undefined")))]
    (-> opts
        (assoc :node-config node-config
               :workload workload)
        memgraph-test)))
(defn all-tests (defn all-tests
"Takes base CLI options and constructs a sequence of test options." "Takes base CLI options and constructs a sequence of test options."
[opts] [opts]
@ -169,7 +183,7 @@
[& args] [& args]
(cli/run! (merge (cli/test-all-cmd {:tests-fn all-tests (cli/run! (merge (cli/test-all-cmd {:tests-fn all-tests
:opt-spec cli-opts}) :opt-spec cli-opts})
(cli/single-test-cmd {:test-fn memgraph-test (cli/single-test-cmd {:test-fn single-test
:opt-spec cli-opts}) :opt-spec cli-opts})
(cli/serve-cmd)) (cli/serve-cmd))
args)) args))

View File

@ -1,7 +1,7 @@
(ns jepsen.memgraph.nemesis (ns jepsen.memgraph.nemesis
"Memgraph nemesis" "Memgraph nemesis"
(:require [jepsen [nemesis :as nemesis] (:require [jepsen [nemesis :as nemesis]
[generator :as gen]] [generator :as gen]]
[jepsen.memgraph.support :as s])) [jepsen.memgraph.support :as s]))
(defn node-killer (defn node-killer
@ -16,10 +16,10 @@
"Can kill and restart all processess and initiate network partitions." "Can kill and restart all processess and initiate network partitions."
[opts] [opts]
(nemesis/compose (nemesis/compose
{{:kill-node :start {{:kill-node :start
:restart-node :stop} (node-killer) :restart-node :stop} (node-killer)
{:start-partition-halves :start {:start-partition-halves :start
:stop-partition-halves :stop} (nemesis/partition-random-halves)})) :stop-partition-halves :stop} (nemesis/partition-random-halves)}))
(defn op (defn op
"Construct a nemesis op" "Construct a nemesis op"
@ -36,7 +36,7 @@
(apply concat) (apply concat)
gen/mix gen/mix
(gen/stagger (:interval opts)) (gen/stagger (:interval opts))
(gen/phases (gen/sleep 10)))) (gen/phases (gen/sleep 60))))
(defn nemesis (defn nemesis
"Composite nemesis and generator" "Composite nemesis and generator"

View File

@ -2,8 +2,8 @@
(:require [clojure.string :as str] (:require [clojure.string :as str]
[clojure.tools.logging :refer [info]] [clojure.tools.logging :refer [info]]
[jepsen [db :as db] [jepsen [db :as db]
[control :as c] [control :as c]
[util :as util :refer [meh]]] [util :as util :refer [meh]]]
[jepsen.control.util :as cu] [jepsen.control.util :as cu]
[jepsen.os.debian :as debian])) [jepsen.os.debian :as debian]))
@ -44,7 +44,7 @@
(throw (Exception. (str local-binary " is not there."))))) (throw (Exception. (str local-binary " is not there.")))))
(info node "Memgraph binary is there" local-binary) (info node "Memgraph binary is there" local-binary)
(start-node! test node) (start-node! test node)
(Thread/sleep 2000))) (Thread/sleep 5000))) ;; TODO(gitbuda): The sleep after Jepsen starting Memgraph is for sure questionable.
(teardown! [_ test node] (teardown! [_ test node]
(info node "Tearing down Memgraph") (info node "Tearing down Memgraph")
(stop-node! test node) (stop-node! test node)