Improve e2e and replication testing setup (#1061)

* Add `--replication-restore-state-on-startup` with `false` as default

Co-authored-by: Aidar Samerkhanov <aidar.samerkhanov@memgraph.io>
Co-authored-by: Andi Skrgat <andi8647@gmail.com>
This commit is contained in:
Marko Budiselić 2023-07-19 21:18:43 +02:00 committed by GitHub
parent 9d056e7649
commit 3b9133fd5a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 376 additions and 182 deletions

View File

@ -266,12 +266,11 @@ jobs:
- name: Run e2e tests
run: |
# TODO(gitbuda): Setup mgclient and pymgclient properly.
cd tests
./setup.sh
source ve3/bin/activate
cd e2e
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:../../libs/mgclient/lib python runner.py --workloads-root-directory .
./run.sh
- name: Run stress test (plain)
run: |
@ -293,7 +292,6 @@ jobs:
run: |
# Activate toolchain.
source /opt/toolchain-v4/activate
cd build
# create mgconsole
@ -340,10 +338,8 @@ jobs:
run: |
# Activate toolchain.
source /opt/toolchain-v4/activate
# Initialize dependencies.
./init
# Build only memgraph release binary.
cd build
cmake -DCMAKE_BUILD_TYPE=release ..
@ -352,7 +348,7 @@ jobs:
- name: Run Jepsen tests
run: |
cd tests/jepsen
./run.sh test --binary ../../build/memgraph --run-args "test-all --node-configs resources/node-config.edn" --ignore-run-stdout-logs --ignore-run-stderr-logs
./run.sh test-all-individually --binary ../../build/memgraph --ignore-run-stdout-logs --ignore-run-stderr-logs
- name: Save Jepsen report
uses: actions/upload-artifact@v3

View File

@ -265,12 +265,11 @@ jobs:
- name: Run e2e tests
run: |
# TODO(gitbuda): Setup mgclient and pymgclient properly.
cd tests
./setup.sh
source ve3/bin/activate
cd e2e
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:../../libs/mgclient/lib python runner.py --workloads-root-directory .
./run.sh
- name: Run stress test (plain)
run: |

View File

@ -264,12 +264,11 @@ jobs:
- name: Run e2e tests
run: |
# TODO(gitbuda): Setup mgclient and pymgclient properly.
cd tests
./setup.sh
source ve3/bin/activate
cd e2e
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:../../libs/mgclient/lib python runner.py --workloads-root-directory .
./run.sh
- name: Run stress test (plain)
run: |
@ -319,10 +318,8 @@ jobs:
run: |
# Activate toolchain.
source /opt/toolchain-v4/activate
# Initialize dependencies.
./init
# Build only memgraph release binary.
cd build
cmake -DCMAKE_BUILD_TYPE=release ..
@ -331,7 +328,7 @@ jobs:
- name: Run Jepsen tests
run: |
cd tests/jepsen
./run.sh test --binary ../../build/memgraph --run-args "test-all --node-configs resources/node-config.edn" --ignore-run-stdout-logs --ignore-run-stderr-logs
./run.sh test-all-individually --binary ../../build/memgraph --ignore-run-stdout-logs --ignore-run-stderr-logs
- name: Save Jepsen report
uses: actions/upload-artifact@v3

View File

@ -264,12 +264,11 @@ jobs:
- name: Run e2e tests
run: |
# TODO(gitbuda): Setup mgclient and pymgclient properly.
cd tests
./setup.sh
source ve3/bin/activate
cd e2e
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:../../libs/mgclient/lib python runner.py --workloads-root-directory .
./run.sh
- name: Run stress test (plain)
run: |

View File

@ -261,6 +261,8 @@ DEFINE_double(query_execution_timeout_sec, 600,
DEFINE_uint64(replication_replica_check_frequency_sec, 1,
"The time duration between two replica checks/pings. If < 1, replicas will NOT be checked at all. NOTE: "
"The MAIN instance allocates a new thread for each REPLICA.");
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_bool(replication_restore_state_on_startup, false, "Restore replication state on startup, e.g. recover replica");
// NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables)
DEFINE_uint64(
@ -891,7 +893,7 @@ int main(int argc, char **argv) {
.wal_file_size_kibibytes = FLAGS_storage_wal_file_size_kib,
.wal_file_flush_every_n_tx = FLAGS_storage_wal_file_flush_every_n_tx,
.snapshot_on_exit = FLAGS_storage_snapshot_on_exit,
.restore_replication_state_on_startup = true,
.restore_replication_state_on_startup = FLAGS_replication_restore_state_on_startup,
.items_per_batch = FLAGS_storage_items_per_batch,
.recovery_thread_count = FLAGS_storage_recovery_thread_count,
.allow_parallel_index_creation = FLAGS_storage_parallel_index_recovery},

View File

@ -162,7 +162,7 @@ InMemoryStorage::InMemoryStorage(Config config)
}
} else {
spdlog::warn(
"Replicastion configuration will NOT be stored. When the server restarts, replication state will be "
"Replication configuration will NOT be stored. When the server restarts, replication state will be "
"forgotten.");
}

View File

@ -350,7 +350,7 @@ uint64_t InMemoryStorage::ReplicationClient::ReplicateCurrentWal() {
/// transactions while Snapshots contain all the data. For that reason we prefer
/// WALs as much as possible. As the WAL file that is currently being updated
/// can change during the process we ignore it as much as possible. Also, it
/// uses the transaction lock so lokcing it can be really expensive. After we
/// uses the transaction lock so locking it can be really expensive. After we
/// fetch the list of finalized WALs, we try to find the longest chain of
/// sequential WALs, starting from the latest one, that will update the recovery
/// with the all missed updates. If the WAL chain cannot be created, replica is

13
tests/e2e/README.md Normal file
View File

@ -0,0 +1,13 @@
# tests/e2e
Framework to run end-to-end tests against Memgraph.
## Notes
* If you change something under this directory and below (even a Python
script), `make` has to be run again because all tests are copied to the build
directory and executed from there.
* Use/extend `run.sh` if you run any e2e tests:
* if all tests have to be executed, use `run.sh`
* if a suite of tests has to be executed, take a look at `run.sh` to see how to do so
* if only a single test has to be executed, take a look at each individual binary/script; it's possible to manually pick the test

View File

@ -187,4 +187,9 @@ startup_config_dict = {
"Path to cypherl file that is used for configuring users and database schema before server starts.",
),
"init_data_file": ("", "", "Path to cypherl file that is used for creating data after server starts."),
"replication_restore_state_on_startup": (
"false",
"false",
"Restore replication state on startup, e.g. recover replica",
),
}

View File

@ -33,13 +33,11 @@
import atexit
import logging
import os
import subprocess
import sys
import tempfile
import time
from argparse import ArgumentParser
from inspect import signature
from pathlib import Path
import yaml
@ -77,9 +75,9 @@ ACTIONS = {
"info": lambda context: info(context),
"stop": lambda context, name: stop(context, name),
"start": lambda context, name: start(context, name),
"sleep": lambda context, delta: time.sleep(float(delta)),
"exit": lambda context: sys.exit(1),
"quit": lambda context: sys.exit(1),
"sleep": lambda _, delta: time.sleep(float(delta)),
"exit": lambda _: sys.exit(1),
"quit": lambda _: sys.exit(1),
}
log = logging.getLogger("memgraph.tests.e2e")

View File

@ -13,7 +13,6 @@ import copy
import os
import subprocess
import sys
import tempfile
import time
import mgclient

View File

@ -147,27 +147,33 @@ def test_basic_recovery(connection):
data_directory = tempfile.TemporaryDirectory()
CONFIGURATION = {
"replica_1": {
"args": ["--bolt-port", "7688", "--log-level=TRACE"],
"args": ["--bolt-port", "7688", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica1.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"],
},
"replica_2": {
"args": ["--bolt-port", "7689", "--log-level=TRACE"],
"args": ["--bolt-port", "7689", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica2.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10002;"],
},
"replica_3": {
"args": ["--bolt-port", "7690", "--log-level=TRACE"],
"args": ["--bolt-port", "7690", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica3.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10003;"],
},
"replica_4": {
"args": ["--bolt-port", "7691", "--log-level=TRACE"],
"args": ["--bolt-port", "7691", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica4.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10004;"],
},
"main": {
"args": ["--bolt-port", "7687", "--log-level=TRACE", "--storage-recover-on-startup=true"],
"args": [
"--bolt-port",
"7687",
"--log-level=TRACE",
"--storage-recover-on-startup=true",
"--replication-restore-state-on-startup=true",
],
"log_file": "main.log",
"setup_queries": [],
"data_directory": f"{data_directory.name}",
@ -359,13 +365,19 @@ def test_replication_role_recovery(connection):
data_directory = tempfile.TemporaryDirectory()
CONFIGURATION = {
"replica": {
"args": ["--bolt-port", "7688", "--log-level=TRACE"],
"args": ["--bolt-port", "7688", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"],
"data_directory": f"{data_directory.name}/replica",
},
"main": {
"args": ["--bolt-port", "7687", "--log-level=TRACE", "--storage-recover-on-startup=true"],
"args": [
"--bolt-port",
"7687",
"--log-level=TRACE",
"--storage-recover-on-startup=true",
"--replication-restore-state-on-startup=true",
],
"log_file": "main.log",
"setup_queries": [],
"data_directory": f"{data_directory.name}/main",
@ -381,13 +393,19 @@ def test_replication_role_recovery(connection):
# When we restart the replica, it does not need this query anymore since it needs to remember state
CONFIGURATION = {
"replica": {
"args": ["--bolt-port", "7688", "--log-level=TRACE"],
"args": ["--bolt-port", "7688", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica.log",
"setup_queries": [],
"data_directory": f"{data_directory.name}/replica",
},
"main": {
"args": ["--bolt-port", "7687", "--log-level=TRACE", "--storage-recover-on-startup=true"],
"args": [
"--bolt-port",
"7687",
"--log-level=TRACE",
"--storage-recover-on-startup=true",
"--replication-restore-state-on-startup=true",
],
"log_file": "main.log",
"setup_queries": [],
"data_directory": f"{data_directory.name}/main",
@ -511,17 +529,23 @@ def test_basic_recovery_when_replica_is_kill_when_main_is_down():
data_directory = tempfile.TemporaryDirectory()
CONFIGURATION = {
"replica_1": {
"args": ["--bolt-port", "7688", "--log-level=TRACE"],
"args": ["--bolt-port", "7688", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica1.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10001;"],
},
"replica_2": {
"args": ["--bolt-port", "7689", "--log-level=TRACE"],
"args": ["--bolt-port", "7689", "--log-level=TRACE", "--replication-restore-state-on-startup=true"],
"log_file": "replica2.log",
"setup_queries": ["SET REPLICATION ROLE TO REPLICA WITH PORT 10002;"],
},
"main": {
"args": ["--bolt-port", "7687", "--log-level=TRACE", "--storage-recover-on-startup=true"],
"args": [
"--bolt-port",
"7687",
"--log-level=TRACE",
"--storage-recover-on-startup=true",
"--replication-restore-state-on-startup=true",
],
"log_file": "main.log",
"setup_queries": [],
"data_directory": f"{data_directory.name}",

37
tests/e2e/run.sh Executable file
View File

@ -0,0 +1,37 @@
#!/bin/bash
# TODO(gitbuda): Setup mgclient and pymgclient properly.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:../../libs/mgclient/lib
# Print the usage text of this script and terminate.
# NOTE: Always exits with status 1, also when help was requested explicitly
# via -h/--help.
print_help() {
    # The inner quotes are escaped so that they are printed literally instead
    # of terminating the string in the middle of the usage line.
    echo -e "$0 [\"workload name string\"]"
    echo -e ""
    echo -e " NOTE: some tests require enterprise licence key,"
    echo -e " to run those define the following env vars:"
    echo -e " * MEMGRAPH_ORGANIZATION_NAME"
    echo -e " * MEMGRAPH_ENTERPRISE_LICENSE"
    exit 1
}
# Print a note (without failing) when at least one of the enterprise license
# environment variables is not defined.
check_license() {
    # Both variables defined -> nothing to report.
    if [[ -v MEMGRAPH_ORGANIZATION_NAME && -v MEMGRAPH_ENTERPRISE_LICENSE ]]; then
        return 0
    fi
    echo "NOTE: MEMGRAPH_ORGANIZATION_NAME or MEMGRAPH_ENTERPRISE_LICENSE NOT defined -> dependent tests will NOT work"
}
# Entry point: dispatch on the number of command line arguments.
#   0 arguments -> run every workload found under the current directory.
#   1 argument  -> either print help (-h/--help) or run the named workload.
#   otherwise   -> print help.
if [ "$#" -eq 0 ]; then
check_license
# NOTE: If you want to run all tests under a specific folder/section, just
# replace the dot (root directory below) with the folder name, e.g.
# `--workloads-root-directory replication`.
python3 runner.py --workloads-root-directory .
elif [ "$#" -eq 1 ]; then
# print_help exits the script, so nothing below runs when help is requested.
if [ "$1" == "-h" ] || [ "$1" == "--help" ]; then
print_help
fi
check_license
# NOTE: --workload-name comes from each individual folder/section
# workloads.yaml file. E.g. `streams/workloads.yaml` has a list of
# `workloads:` and each workload has its own `name`.
python3 runner.py --workloads-root-directory . --workload-name "$1"
else
print_help
fi

View File

@ -1,4 +0,0 @@
#!/bin/bash
# TODO: andi as a side project
python3 runner.py --workloads-root-directory disk_storage

View File

@ -5,14 +5,6 @@ test_transaction_queue: &test_transaction_queue
log_file: "transaction_queue.log"
setup_queries: []
validation_queries: []
disk_test_transaction_queue: &disk_test_transaction_queue
cluster:
main:
args: ["--bolt-port", "7687", "--log-level=TRACE", "--also-log-to-stderr"]
log_file: "transaction_queue.log"
setup_queries: ["STORAGE MODE ON_DISK_TRANSACTIONAL"]
validation_queries: []
workloads:
- name: "test-transaction-queue" # should be the same as the python file
@ -20,8 +12,3 @@ workloads:
proc: "tests/e2e/transaction_queue/procedures/"
args: ["transaction_queue/test_transaction_queue.py"]
<<: *test_transaction_queue
- name: "test-transaction-queue on disk" # should be the same as the python file
binary: "tests/e2e/pytest_runner.sh"
proc: "tests/e2e/transaction_queue/procedures/"
args: ["transaction_queue/test_transaction_queue.py"]
<<: *disk_test_transaction_queue

View File

@ -2,3 +2,31 @@
NOTE: Jepsen can only connect to the SSH server on the default 22 port.
`--node` flag only takes the actual address (:port doesn't work).
Jepsen run under CI:
```
cd tests/jepsen
./run.sh test --binary ../../build/memgraph --run-args "test-all --node-configs resources/node-config.edn" --ignore-run-stdout-logs --ignore-run-stderr-logs
```
Local run of each test (including setup):
```
cd tests/jepsen
./run.sh cluster-up
docker exec -it jepsen-control bash
cd memgraph
lein run test --workload bank --node-configs resources/node-config.edn
lein run test --workload large --node-configs resources/node-config.edn
```
Logs are located under `jepsen-control:/jepsen/memgraph/store`.
If you set up the cluster manually, go to the jepsen-control Docker container and ssh to all cluster nodes to save their host keys in known_hosts.
```
docker exec -it jepsen-control bash
ssh n1 -> yes -> exit
ssh n2 -> yes -> exit
ssh n3 -> yes -> exit
ssh n4 -> yes -> exit
ssh n5 -> yes -> exit
```

View File

@ -0,0 +1,13 @@
diff --git a/docker/control/Dockerfile b/docker/control/Dockerfile
index 6b2d3c0e..195a7a60 100644
--- a/docker/control/Dockerfile
+++ b/docker/control/Dockerfile
@@ -7,7 +7,7 @@ ENV LEIN_ROOT true
# Jepsen dependencies
#
RUN apt-get -y -q update && \
- apt-get install -qy openjdk-17-jdk-headless \
+ apt-get install -qy ca-certificates-java openjdk-17-jdk-headless \
libjna-java \
vim \
emacs \

View File

@ -5,7 +5,10 @@
:url "https://github.com/memgraph/memgraph/blob/master/release/LICENSE_ENTERPRISE.md"}
:main jepsen.memgraph.core
:dependencies [[org.clojure/clojure "1.10.0"]
[jepsen "0.2.1-SNAPSHOT"]
;; 0.2.4-SNAPSHOT but 0.3.0, for more -> https://clojars.org/jepsen/versions
[jepsen "0.2.4-SNAPSHOT"]
[gorillalabs/neo4j-clj "4.1.0"]]
:profiles {:test {:dependencies [#_[org.neo4j.test/neo4j-harness "4.1.0"]]}}
;; required to run 0.3.0
; :aot :all
:repl-options {:init-ns jepsen.memgraph.core})

View File

@ -1,16 +1,18 @@
#!/bin/bash
set -Eeuo pipefail
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
MEMGRAPH_BINARY_PATH="../../build/memgraph"
# NOTE: On Ubuntu 22.04 0.3.2 uses non-existing docker compose --compatibility flag.
# NOTE: On Ubuntu 22.04 0.3.1 seems to be working.
JEPSEN_VERSION="${JEPSEN_VERSION:-v0.3.0}"
# NOTE: Jepsen Git tags are not consistent, there are: 0.2.4, v0.3.0, 0.3.2, ...
# NOTE: On Ubuntu 22.04 v0.3.2 uses non-existing docker compose --compatibility flag.
# NOTE: On Ubuntu 22.04 v0.3.0 and v0.3.1 seem to be runnable.
# TODO(gitbuda): Make sure Memgraph can be tested with Jepsen >= 0.3.0
JEPSEN_VERSION="${JEPSEN_VERSION:-0.2.4}"
JEPSEN_ACTIVE_NODES_NO=5
CONTROL_LEIN_RUN_ARGS="test-all --node-configs resources/node-config.edn"
CONTROL_LEIN_RUN_STDOUT_LOGS=1
CONTROL_LEIN_RUN_STDERR_LOGS=1
_JEPSEN_RUN_EXIT_STATUS=0
PRINT_CONTEXT() {
echo -e "MEMGRAPH_BINARY_PATH:\t\t $MEMGRAPH_BINARY_PATH"
echo -e "JEPSEN_VERSION:\t\t\t $JEPSEN_VERSION"
@ -22,7 +24,7 @@ PRINT_CONTEXT() {
HELP_EXIT() {
echo ""
echo "HELP: $0 help|cluster-up|test [args]"
echo "HELP: $0 help|cluster-up|cluster-cleanup|cluster-dealloc|mgbuild|test|test-all-individually [args]"
echo ""
echo " test args --binary MEMGRAPH_BINARY_PATH"
echo " --ignore-run-stdout-logs Ignore lein run stdout logs."
@ -45,153 +47,235 @@ if ! command -v docker > /dev/null 2>&1 || ! command -v docker-compose > /dev/nu
ERROR "docker and docker-compose have to be installed."
exit 1
fi
PRINT_CONTEXT
if [ ! -d "$script_dir/jepsen" ]; then
git clone https://github.com/jepsen-io/jepsen.git -b "$JEPSEN_VERSION" "$script_dir/jepsen"
if [ "$JEPSEN_VERSION" == "v0.3.0" ]; then
if [ -f "$script_dir/jepsen_0.3.0.patch" ]; then
cd "$script_dir/jepsen"
git apply "$script_dir/jepsen_0.3.0.patch"
cd "$script_dir"
fi
fi
fi
if [ "$#" -lt 1 ]; then
HELP_EXIT
fi
# Parse the options that follow the subcommand and store them in the global
# configuration variables.
#   $1    - the subcommand name (dropped, not interpreted here)
#   $2... - options:
#             --binary PATH              -> MEMGRAPH_BINARY_PATH
#             --nodes-no N               -> JEPSEN_ACTIVE_NODES_NO
#             --run-args "ARGS"          -> CONTROL_LEIN_RUN_ARGS
#             --ignore-run-stdout-logs   -> CONTROL_LEIN_RUN_STDOUT_LOGS=0
#             --ignore-run-stderr-logs   -> CONTROL_LEIN_RUN_STDERR_LOGS=0
# An unknown option, or a value-taking option without a value, is reported via
# ERROR followed by HELP_EXIT.
PROCESS_ARGS() {
    # Drop the subcommand itself; only its options remain.
    shift
    while [[ $# -gt 0 ]]; do
        key="$1"
        case $key in
            --binary|--nodes-no|--run-args)
                # Without this check a trailing option would dereference an
                # unset positional parameter, which is a hard error under
                # `set -u` and hides the real cause from the user.
                if [[ $# -lt 2 ]]; then
                    ERROR "Option $key requires a value."
                    HELP_EXIT
                fi
                case $key in
                    --binary)
                        MEMGRAPH_BINARY_PATH="$2"
                        ;;
                    --nodes-no)
                        JEPSEN_ACTIVE_NODES_NO="$2"
                        ;;
                    --run-args)
                        CONTROL_LEIN_RUN_ARGS="$2"
                        ;;
                esac
                shift 2
                ;;
            --ignore-run-stdout-logs)
                CONTROL_LEIN_RUN_STDOUT_LOGS=0
                shift
                ;;
            --ignore-run-stderr-logs)
                CONTROL_LEIN_RUN_STDERR_LOGS=0
                shift
                ;;
            *)
                ERROR "Unknown option $1."
                HELP_EXIT
                ;;
        esac
    done
}
# Copy the Memgraph binary into every active Jepsen node container and the
# test sources into the jepsen-control container.
# Reads:  MEMGRAPH_BINARY_PATH, JEPSEN_ACTIVE_NODES_NO, script_dir.
# Effect: on each jepsen-n<i> container /opt/memgraph is wiped and recreated,
#         the binary is copied in, /opt/memgraph/memgraph is created as a
#         symlink to it and an empty /opt/memgraph/memgraph.log is touched.
COPY_BINARIES() {
# Copy Memgraph binary, handles both cases, when binary is a sym link
# or a regular file.
binary_path="$MEMGRAPH_BINARY_PATH"
if [ -L "$binary_path" ]; then
# NOTE(review): plain `readlink` returns the link target as stored; a
# relative target would then be resolved against the current directory,
# not the directory of the link -- confirm the links used here are absolute.
binary_path=$(readlink "$binary_path")
fi
binary_name=$(basename -- "$binary_path")
for iter in $(seq 1 "$JEPSEN_ACTIVE_NODES_NO"); do
jepsen_node_name="jepsen-n$iter"
docker_exec="docker exec $jepsen_node_name bash -c"
# A binary that is itself called "memgraph" is stored under a different
# name, because /opt/memgraph/memgraph is used for the symlink below.
if [ "$binary_name" == "memgraph" ]; then
_binary_name="memgraph_tmp"
else
_binary_name="$binary_name"
fi
# Start from an empty /opt/memgraph on the node.
$docker_exec "rm -rf /opt/memgraph/ && mkdir -p /opt/memgraph"
docker cp "$binary_path" "$jepsen_node_name":/opt/memgraph/"$_binary_name"
$docker_exec "ln -s /opt/memgraph/$_binary_name /opt/memgraph/memgraph"
$docker_exec "touch /opt/memgraph/memgraph.log"
INFO "Copying $binary_name to $jepsen_node_name DONE."
done
# Copy test files into the control node.
# The store directory is created up front as well.
docker exec jepsen-control mkdir -p /jepsen/memgraph/store
docker cp "$script_dir/src/." jepsen-control:/jepsen/memgraph/src/
docker cp "$script_dir/test/." jepsen-control:/jepsen/memgraph/test/
docker cp "$script_dir/resources/." jepsen-control:/jepsen/memgraph/resources/
docker cp "$script_dir/project.clj" jepsen-control:/jepsen/memgraph/project.clj
INFO "Copying test files to jepsen-control DONE."
}
# Run `lein run <args>` inside the jepsen-control container and store the exit
# status of that run in _JEPSEN_RUN_EXIT_STATUS.
#   $1 - arguments appended to `lein run`
# Each output stream of the run is discarded when the corresponding
# CONTROL_LEIN_RUN_STDOUT_LOGS / CONTROL_LEIN_RUN_STDERR_LOGS flag equals 0.
RUN_JEPSEN() {
    __control_lein_run_args="$1"
    # errexit is disabled for the duration of the run to be able to archive the
    # run result even if the run fails; it is re-enabled at the end.
    set +e
    # Logs pass through unless the respective flag is 0.
    redirect_stdout_logs="/dev/stdout"
    if [ "$CONTROL_LEIN_RUN_STDOUT_LOGS" -eq 0 ]; then
        redirect_stdout_logs="/dev/null"
    fi
    redirect_stderr_logs="/dev/stderr"
    if [ "$CONTROL_LEIN_RUN_STDERR_LOGS" -eq 0 ]; then
        redirect_stderr_logs="/dev/null"
    fi
    # NOTE: docker exec -t is NOT ok because gh CI user does NOT have TTY.
    # NOTE: ~/.bashrc has to be manually sourced when bash -c is used
    # because some Jepsen config is there.
    docker exec jepsen-control bash -c "source ~/.bashrc && cd memgraph && lein run $__control_lein_run_args" 1> $redirect_stdout_logs 2> $redirect_stderr_logs
    # Must directly follow the docker exec so that $? is the run's status.
    _JEPSEN_RUN_EXIT_STATUS=$?
    set -e
}
# Print the tail of jepsen.log for every workload run recorded between two
# timestamps and pack those run folders into Jepsen.tar.gz.
#   $1 - start time, compared as a string against the run folder names
#   $2 - end time, compared as a string against the run folder names
# Effect: creates /jepsen/memgraph/Jepsen.tar.gz inside jepsen-control and
#         copies it into the current directory.
PROCESS_RESULTS() {
start_time="$1"
end_time="$2"
INFO "Process results..."
# Print and pack all test workload runs between start and end time.
# NOTE(review): with `set -Eeuo pipefail` active at the top of this script, a
# `grep` without any match makes this assignment fail and aborts the script
# -- confirm that is acceptable when the store directory has no test-* entry.
all_workloads=$(docker exec jepsen-control bash -c 'ls /jepsen/memgraph/store/' | grep test-)
all_workload_run_folders=""
for workload in $all_workloads; do
for time_folder in $(docker exec jepsen-control bash -c "ls /jepsen/memgraph/store/$workload"); do
# "latest" is not a timestamped run folder, skip it.
if [[ "$time_folder" == "latest" ]]; then
continue
fi
# The early continue pattern here is nice because bash doesn't
# have >= for the string comparison (marginal values).
if [[ "$time_folder" < "$start_time" ]]; then
continue
fi
if [[ "$time_folder" > "$end_time" ]]; then
continue
fi
INFO "jepsen.log for $workload/$time_folder"
docker exec jepsen-control bash -c "tail -n 50 /jepsen/memgraph/store/$workload/$time_folder/jepsen.log"
# Accumulate a space separated list of folders for the tar call below.
all_workload_run_folders="$all_workload_run_folders /jepsen/memgraph/store/$workload/$time_folder"
done
done
INFO "Packing results..."
# NOTE(review): if no folder was selected, tar is invoked without any member
# -- verify how that case should be handled.
docker exec jepsen-control bash -c "tar -czvf /jepsen/memgraph/Jepsen.tar.gz $all_workload_run_folders"
docker cp jepsen-control:/jepsen/memgraph/Jepsen.tar.gz ./
INFO "Result processing (printing and packing) DONE."
}
# Initialize testing context by copying source/binary files. Inside CI,
# Memgraph is tested on a single machine cluster based on Docker containers.
# Once these tests will be part of the official Jepsen repo, the majority of
# functionalities inside this script won't be needed because each node clones
# the public repo.
case $1 in
help)
HELP_EXIT
;;
# Start Jepsen Docker cluster of 5 nodes. To configure the cluster please
# take a look under jepsen/docker/docker-compose.yml.
# NOTE: If you delete the jepsen folder where docker config is located,
# the current cluster is broken because it relies on the folder. That can
# happen easily because the jepsen folder is git ignored.
cluster-up)
PRINT_CONTEXT
"$script_dir/jepsen/docker/bin/up" --daemon
;;
# Run tests against the specified Memgraph binary.
test)
shift
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
--binary)
shift
MEMGRAPH_BINARY_PATH="$1"
shift
;;
--ignore-run-stdout-logs)
CONTROL_LEIN_RUN_STDOUT_LOGS=0
shift
;;
--ignore-run-stderr-logs)
CONTROL_LEIN_RUN_STDERR_LOGS=0
shift
;;
--nodes-no)
shift
JEPSEN_ACTIVE_NODES_NO="$1"
shift
;;
--run-args)
shift
CONTROL_LEIN_RUN_ARGS="$1"
shift
;;
*)
ERROR "Unknown option $1."
HELP_EXIT
;;
esac
done
# Copy Memgraph binary, handles both cases, when binary is a sym link
# or a regular file.
binary_path="$MEMGRAPH_BINARY_PATH"
if [ -L "$binary_path" ]; then
binary_path=$(readlink "$binary_path")
fi
binary_name=$(basename -- "$binary_path")
cluster-cleanup)
jepsen_control_exec="docker exec jepsen-control bash -c"
INFO "Deleting /jepsen/memgraph/store/* on jepsen-control"
$jepsen_control_exec "rm -rf /jepsen/memgraph/store/*"
for iter in $(seq 1 "$JEPSEN_ACTIVE_NODES_NO"); do
jepsen_node_name="jepsen-n$iter"
docker_exec="docker exec $jepsen_node_name bash -c"
if [ "$binary_name" == "memgraph" ]; then
_binary_name="memgraph_tmp"
else
_binary_name="$binary_name"
fi
$docker_exec "rm -rf /opt/memgraph/ && mkdir -p /opt/memgraph"
docker cp "$binary_path" "$jepsen_node_name":/opt/memgraph/"$_binary_name"
$docker_exec "ln -s /opt/memgraph/$_binary_name /opt/memgraph/memgraph"
$docker_exec "touch /opt/memgraph/memgraph.log"
INFO "Copying $binary_name to $jepsen_node_name DONE."
jepsen_node_exec="docker exec $jepsen_node_name bash -c"
INFO "Deleting /opt/memgraph/* on $jepsen_node_name"
$jepsen_node_exec "rm -rf /opt/memgraph/*"
done
;;
# Copy test files into the control node.
docker exec jepsen-control mkdir -p /jepsen/memgraph
docker cp "$script_dir/src/." jepsen-control:/jepsen/memgraph/src/
docker cp "$script_dir/test/." jepsen-control:/jepsen/memgraph/test/
docker cp "$script_dir/resources/." jepsen-control:/jepsen/memgraph/resources/
docker cp "$script_dir/project.clj" jepsen-control:/jepsen/memgraph/project.clj
INFO "Copying test files to jepsen-control DONE."
cluster-dealloc)
ps=$(docker ps --filter name=jepsen* --filter status=running -q)
if [[ ! -z ${ps} ]]; then
echo "Killing ${ps}"
docker rm -f ${ps}
imgs=$(docker images "jepsen*" -q)
if [[ ! -z ${imgs} ]]; then
echo "Removing ${imgs}"
docker images "jepsen*" -q | xargs docker image rmi -f
else
echo "No Jepsen images detected!"
fi
else
echo "No Jepsen containers detected!"
fi
;;
mgbuild)
PRINT_CONTEXT
echo ""
echo "TODO(gitbuda): Build memgraph for Debian 10 via memgraph/memgraph-builder"
exit 1
;;
test)
PROCESS_ARGS "$@"
PRINT_CONTEXT
COPY_BINARIES
start_time="$(docker exec jepsen-control bash -c 'date -u +"%Y%m%dT%H%M%S"').000Z"
# Run the test.
# NOTE: docker exec -t is NOT ok because gh CI user does NOT have TTY.
# NOTE: ~/.bashrc has to be manually sourced when bash -c is used
# because some Jepsen config is there.
set +e
if [ "$CONTROL_LEIN_RUN_STDOUT_LOGS" -eq 0 ]; then
redirect_stdout_logs="/dev/null"
else
redirect_stdout_logs="/dev/stdout"
fi
if [ "$CONTROL_LEIN_RUN_STDERR_LOGS" -eq 0 ]; then
redirect_stderr_logs="/dev/null"
else
redirect_stderr_logs="/dev/stderr"
fi
INFO "Jepsen run in progress... START_TIME: $start_time"
docker exec jepsen-control bash -c "source ~/.bashrc && cd memgraph && lein run $CONTROL_LEIN_RUN_ARGS" 1> $redirect_stdout_logs 2> $redirect_stderr_logs
# To be able to archive the run result even if the run fails.
jepsen_run_exit_status=$?
RUN_JEPSEN "$CONTROL_LEIN_RUN_ARGS"
end_time="$(docker exec jepsen-control bash -c 'date -u +"%Y%m%dT%H%M%S"').000Z"
INFO "Jepsen run DONE. END_TIME: $end_time"
set -e
# Pack all test workload runs between start and end time.
all_workloads=$(docker exec jepsen-control bash -c 'ls /jepsen/memgraph/store/' | grep test-)
all_workload_run_folders=""
for workload in $all_workloads; do
for time_folder in $(docker exec jepsen-control bash -c "ls /jepsen/memgraph/store/$workload"); do
if [[ "$time_folder" == "latest" ]]; then
continue
fi
# The early continue pattern here is nice because bash doesn't
# have >= for the string comparison (marginal values).
if [[ "$time_folder" < "$start_time" ]]; then
continue
fi
if [[ "$time_folder" > "$end_time" ]]; then
continue
fi
all_workload_run_folders="$all_workload_run_folders /jepsen/memgraph/store/$workload/$time_folder"
done
done
docker exec jepsen-control bash -c "tar -czvf /jepsen/memgraph/Jepsen.tar.gz $all_workload_run_folders"
docker cp jepsen-control:/jepsen/memgraph/Jepsen.tar.gz ./
INFO "Test and results packing DONE."
# If the run has failed, this script also has to return non-zero status.
if [ "$jepsen_run_exit_status" -ne 0 ]; then
exit "$jepsen_run_exit_status"
PROCESS_RESULTS "$start_time" "$end_time"
# Exit if the jepsen run status is not 0
if [ "$_JEPSEN_RUN_EXIT_STATUS" -ne 0 ]; then
ERROR "Jepsen FAILED" # important for the coder
exit "$_JEPSEN_RUN_EXIT_STATUS" # important for CI
fi
;;
test-all-individually)
PROCESS_ARGS "$@"
PRINT_CONTEXT
INFO "NOTE: CONTROL_LEIN_RUN_ARGS ignored"
COPY_BINARIES
start_time="$(docker exec jepsen-control bash -c 'date -u +"%Y%m%dT%H%M%S"').000Z"
INFO "Jepsen run in progress... START_TIME: $start_time"
for workload in "bank" "large"; do
RUN_JEPSEN "test --workload $workload --node-configs resources/node-config.edn"
if [ "$_JEPSEN_RUN_EXIT_STATUS" -ne 0 ]; then
break
fi
done
end_time="$(docker exec jepsen-control bash -c 'date -u +"%Y%m%dT%H%M%S"').000Z"
INFO "Jepsen run DONE. END_TIME: $end_time"
PROCESS_RESULTS "$start_time" "$end_time"
# Exit if the jepsen run status is not 0
if [ "$_JEPSEN_RUN_EXIT_STATUS" -ne 0 ]; then
ERROR "Jepsen FAILED" # important for the coder
exit "$_JEPSEN_RUN_EXIT_STATUS" # important for CI
fi
;;
*)
HELP_EXIT
HELP_EXIT
;;
esac

View File

@ -146,9 +146,23 @@
["-w" "--workload NAME" "Test workload to run"
:parse-fn keyword
:validate [workloads (cli/one-of workloads)]]
[nil "--node-configs PATH" "Path to the node configuration file."
[nil "--node-configs PATH" "Path to a file containing a list of node config."
:parse-fn #(-> % e/load-configuration)]])
(defn single-test
  "Builds a single Memgraph test from the base CLI options.
  Both :workload and :node-configs have to be present in opts, otherwise an
  Exception is thrown. Only the first entry of :node-configs is used; it is
  merged with (:nodes opts) before the test is constructed."
  [opts]
  (let [workload    (or (:workload opts)
                        (throw (Exception. "Workload undefined")))
        node-config (if-let [configs (:node-configs opts)]
                      (->> (list (first configs))
                           (merge-node-configurations (:nodes opts))
                           first)
                      (throw (Exception. "Node configs undefined")))]
    (memgraph-test (assoc opts
                          :node-config node-config
                          :workload workload))))
(defn all-tests
"Takes base CLI options and constructs a sequence of test options."
[opts]
@ -169,7 +183,7 @@
[& args]
(cli/run! (merge (cli/test-all-cmd {:tests-fn all-tests
:opt-spec cli-opts})
(cli/single-test-cmd {:test-fn memgraph-test
(cli/single-test-cmd {:test-fn single-test
:opt-spec cli-opts})
(cli/serve-cmd))
args))

View File

@ -1,7 +1,7 @@
(ns jepsen.memgraph.nemesis
"Memgraph nemesis"
(:require [jepsen [nemesis :as nemesis]
[generator :as gen]]
[generator :as gen]]
[jepsen.memgraph.support :as s]))
(defn node-killer
@ -16,10 +16,10 @@
"Can kill and restart all processess and initiate network partitions."
[opts]
(nemesis/compose
{{:kill-node :start
:restart-node :stop} (node-killer)
{:start-partition-halves :start
:stop-partition-halves :stop} (nemesis/partition-random-halves)}))
{{:kill-node :start
:restart-node :stop} (node-killer)
{:start-partition-halves :start
:stop-partition-halves :stop} (nemesis/partition-random-halves)}))
(defn op
"Construct a nemesis op"
@ -36,7 +36,7 @@
(apply concat)
gen/mix
(gen/stagger (:interval opts))
(gen/phases (gen/sleep 10))))
(gen/phases (gen/sleep 60))))
(defn nemesis
"Composite nemesis and generator"

View File

@ -2,8 +2,8 @@
(:require [clojure.string :as str]
[clojure.tools.logging :refer [info]]
[jepsen [db :as db]
[control :as c]
[util :as util :refer [meh]]]
[control :as c]
[util :as util :refer [meh]]]
[jepsen.control.util :as cu]
[jepsen.os.debian :as debian]))
@ -44,7 +44,7 @@
(throw (Exception. (str local-binary " is not there.")))))
(info node "Memgraph binary is there" local-binary)
(start-node! test node)
(Thread/sleep 2000)))
(Thread/sleep 5000))) ;; TODO(gitbuda): The sleep after Jepsen starting Memgraph is for sure questionable.
(teardown! [_ test node]
(info node "Tearing down Memgraph")
(stop-node! test node)