memgraph/tests/manual/distributed_repl.cpp
Matej Ferencevic 53c405c699 Throw exceptions on RPC failure and Distributed error handling
Summary:
This diff changes the RPC layer to directly return `TResponse` to the user when
issuing a `Call<...>` RPC call. The call throws an exception on failure
(instead of the previous return `nullopt`).

All servers (network, RPC and distributed) are set to have explicit `Shutdown`
methods so that a controlled shutdown can always be performed. The object
destructors now have `CHECK`s to enforce that the `AwaitShutdown` methods were
called.

The distributed memgraph is changed that none of the binaries (master/workers)
crash when there is a communication failure. Instead, the whole cluster starts
a graceful shutdown when a persistent communication error is detected.
Transient errors are allowed during execution. The transaction that errored out
will be aborted on the whole cluster. The cluster state is managed using a new
Heartbeat RPC call.

Reviewers: buda, teon.banek, msantl

Reviewed By: teon.banek

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D1604
2018-09-27 16:27:40 +02:00

87 lines
2.6 KiB
C++

#include <chrono>
#include <experimental/filesystem>
#include <iostream>
#include <memory>
#include <thread>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "database/distributed_graph_db.hpp"
#include "query/distributed_interpreter.hpp"
#include "query/repl.hpp"
#include "utils/file.hpp"
#include "utils/flag_validation.hpp"
DEFINE_VALIDATED_int32(worker_count, 1,
"The number of worker nodes in cluster.",
FLAG_IN_RANGE(1, 1000));
DECLARE_int32(min_log_level);
DECLARE_string(durability_directory);
namespace fs = std::experimental::filesystem;
const std::string kLocal = "127.0.0.1";
class WorkerInThread {
public:
explicit WorkerInThread(database::Config config) : worker_(config) {
thread_ =
std::thread([this, config] { EXPECT_TRUE(worker_.AwaitShutdown()); });
}
~WorkerInThread() {
if (thread_.joinable()) thread_.join();
}
database::Worker worker_;
std::thread thread_;
};
fs::path GetDurabilityDirectory(const fs::path &path, int worker_id) {
if (worker_id == 0) return path / "master";
return path / fmt::format("worker{}", worker_id);
}
int main(int argc, char *argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
FLAGS_min_log_level = google::ERROR;
google::InitGoogleLogging(argv[0]);
fs::path tmp_dir =
fs::temp_directory_path() / "MG_test_manual_distributed_repl";
EXPECT_TRUE(utils::EnsureDir(tmp_dir));
// Start the master
database::Config master_config;
master_config.master_endpoint = {kLocal, 0};
master_config.durability_directory = GetDurabilityDirectory(tmp_dir, 0);
// Flag needs to be updated due to props on disk storage.
FLAGS_durability_directory = GetDurabilityDirectory(tmp_dir, 0);
auto master = std::make_unique<database::Master>(master_config);
// Allow the master to get initialized before making workers.
std::this_thread::sleep_for(std::chrono::milliseconds(250));
std::vector<std::unique_ptr<WorkerInThread>> workers;
for (int i = 0; i < FLAGS_worker_count; ++i) {
database::Config config;
config.worker_id = i + 1;
config.master_endpoint = master->endpoint();
config.worker_endpoint = {kLocal, 0};
config.durability_directory = GetDurabilityDirectory(tmp_dir, i + 1);
// Flag needs to be updated due to props on disk storage.
FLAGS_durability_directory = GetDurabilityDirectory(tmp_dir, i + 1);
workers.emplace_back(std::make_unique<WorkerInThread>(config));
}
// Start the REPL
{
query::DistributedInterpreter interpreter(master.get());
query::Repl(master.get(), &interpreter);
}
master = nullptr;
return 0;
}