Remove distributed

Reviewers: teon.banek

Reviewed By: teon.banek

Subscribers: pullbot

Differential Revision:
This commit is contained in:
Matej Ferencevic 2019-07-17 12:32:22 +02:00
parent 59b3f84eb9
commit 111dd8bf19
213 changed files with 13 additions and 31482 deletions

View File

@ -1,9 +1,9 @@
- name: Binaries
- build_debug/memgraph
- build_debug/memgraph_distributed
- build_debug/memgraph_ha
- build_release/memgraph
- build_release/memgraph_distributed
- build_release/memgraph_ha
- build_release/tools/src/mg_client
- build_release/tools/src/mg_import_csv
- build_release/tools/src/mg_statsd

View File

@ -25,7 +25,7 @@
# Build coverage binaries.
cd ..
# TODO: uncomment this build once single node, ha and distributed are split
# TODO: uncomment this build once single node and ha are split
# mkdir build_coverage
# cd build_coverage
@ -37,12 +37,8 @@
mkdir build_release
cd build_release
cmake -DCMAKE_BUILD_TYPE=release ..
TIMEOUT=1200 make -j$THREADS memgraph memgraph_distributed memgraph_ha tools memgraph__macro_benchmark memgraph__stress memgraph__manual__card_fraud_generate_snapshot memgraph__feature_benchmark__kafka__benchmark memgraph__feature_benchmark__ha__read__benchmark memgraph__feature_benchmark__ha__write__benchmark
# Generate distributed card fraud dataset.
cd ../tests/distributed/card_fraud
cd ../../..
TIMEOUT=1200 make -j$THREADS
cd ..
# Checkout to parent commit and initialize.
cd ../parent
@ -78,7 +74,7 @@
# Build coverage binaries.
cd ..
# TODO: uncomment this build once single node, ha and distributed are split
# TODO: uncomment this build once single node and ha are split
# mkdir build_coverage
# cd build_coverage
@ -96,7 +92,3 @@
mkdir output
cd output
cpack -G DEB --config ../CPackConfig.cmake
# Generate distributed card fraud dataset.
cd ../../tests/distributed/card_fraud

View File

@ -109,135 +109,6 @@ target_compile_definitions(mg-single-node PUBLIC MG_SINGLE_NODE)
# END Memgraph Single Node
# ----------------------------------------------------------------------------
# ----------------------------------------------------------------------------
# Memgraph Distributed
# ----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
define_add_lcp(add_lcp_distributed mg_distributed_sources generated_lcp_distributed_files)
add_lcp_distributed(database/distributed/serialization.lcp SLK_SERIALIZE
DEPENDS durability/distributed/state_delta.lcp)
add_lcp_distributed(distributed/bfs_rpc_messages.lcp SLK_SERIALIZE)
add_lcp_distributed(distributed/coordination_rpc_messages.lcp SLK_SERIALIZE)
add_lcp_distributed(distributed/data_rpc_messages.lcp SLK_SERIALIZE)
add_lcp_distributed(distributed/durability_rpc_messages.lcp SLK_SERIALIZE)
add_lcp_distributed(distributed/index_rpc_messages.lcp SLK_SERIALIZE)
add_lcp_distributed(distributed/plan_rpc_messages.lcp SLK_SERIALIZE)
add_lcp_distributed(distributed/pull_produce_rpc_messages.lcp SLK_SERIALIZE
DEPENDS transactions/distributed/serialization.lcp)
add_lcp_distributed(distributed/storage_gc_rpc_messages.lcp SLK_SERIALIZE)
add_lcp_distributed(distributed/token_sharing_rpc_messages.lcp SLK_SERIALIZE)
add_lcp_distributed(distributed/updates_rpc_messages.lcp SLK_SERIALIZE)
add_lcp_distributed(distributed/dynamic_worker_rpc_messages.lcp SLK_SERIALIZE)
add_lcp_distributed(query/distributed/frontend/ast/ast_serialization.lcp SLK_SERIALIZE
DEPENDS query/frontend/ast/ast.lcp)
add_lcp_distributed(query/distributed/frontend/semantic/symbol_serialization.lcp SLK_SERIALIZE
DEPENDS query/frontend/semantic/symbol.lcp)
add_lcp_distributed(query/distributed/plan/ops.lcp SLK_SERIALIZE
DEPENDS query/plan/operator.lcp)
add_lcp_distributed(storage/distributed/rpc/concurrent_id_mapper_rpc_messages.lcp SLK_SERIALIZE)
add_lcp_distributed(transactions/distributed/engine_rpc_messages.lcp SLK_SERIALIZE
DEPENDS transactions/distributed/serialization.lcp)
add_custom_target(generate_lcp_distributed DEPENDS generate_lcp_common ${generated_lcp_distributed_files})
set(MG_DISTRIBUTED_LIBS stdc++fs Threads::Threads fmt cppitertools
antlr_opencypher_parser_lib dl glog gflags
mg-utils mg-io mg-integrations-kafka mg-requests
mg-communication mg-comm-rpc mg-auth)
# STATIC library used by memgraph executables
add_library(mg-distributed STATIC ${mg_distributed_sources})
target_link_libraries(mg-distributed ${MG_DISTRIBUTED_LIBS})
add_dependencies(mg-distributed generate_opencypher_parser)
add_dependencies(mg-distributed generate_lcp_distributed)
target_compile_definitions(mg-distributed PUBLIC MG_DISTRIBUTED)
# ----------------------------------------------------------------------------
# END Memgraph Distributed
# ----------------------------------------------------------------------------
# ----------------------------------------------------------------------------
# Memgraph Single Node High Availability
# ----------------------------------------------------------------------------
@ -326,7 +197,7 @@ target_compile_definitions(mg-single-node-ha PUBLIC MG_SINGLE_NODE_HA)
# ----------------------------------------------------------------------------
add_dependencies(generate_lcp generate_lcp_single_node generate_lcp_single_node_ha generate_lcp_distributed)
add_dependencies(generate_lcp generate_lcp_single_node generate_lcp_single_node_ha)
string(TOLOWER ${CMAKE_BUILD_TYPE} lower_build_type)
@ -405,20 +276,6 @@ install(
install(DIRECTORY ${examples}/build/ DESTINATION share/memgraph/examples)
# memgraph distributed main executable
add_executable(memgraph_distributed memgraph_distributed.cpp)
target_link_libraries(memgraph_distributed mg-distributed kvstore_lib telemetry_lib)
set_target_properties(memgraph_distributed PROPERTIES
# Set the executable output name to include version information.
OUTPUT_NAME "memgraph_distributed-${memgraph_VERSION}-${COMMIT_HASH}_${CMAKE_BUILD_TYPE}"
# Output the executable in main binary dir.
# Create symlink to the built executable.
add_custom_command(TARGET memgraph_distributed POST_BUILD
COMMAND ${CMAKE_COMMAND} -E create_symlink $<TARGET_FILE:memgraph_distributed> ${CMAKE_BINARY_DIR}/memgraph_distributed
BYPRODUCTS ${CMAKE_BINARY_DIR}/memgraph_distributed
COMMENT Creating symlink to memgraph distributed executable)
# memgraph single node high availability executable
add_executable(memgraph_ha memgraph_ha.cpp)
target_link_libraries(memgraph_ha mg-single-node-ha kvstore_lib telemetry_lib)

View File

@ -1,114 +0,0 @@
#include <limits>
#include "database/distributed/graph_db.hpp"
#include "storage/distributed/gid.hpp"
#include "utils/flag_validation.hpp"
#include "utils/string.hpp"
// Durability flags.
DEFINE_bool(durability_enabled, false,
"If durability (database persistence) should be enabled");
durability_directory, "durability",
"Path to directory in which to save snapshots and write-ahead log files.");
DEFINE_bool(db_recover_on_startup, false, "Recover database on startup.");
snapshot_cycle_sec, 3600,
"Amount of time between two snapshots, in seconds (min 60).",
FLAG_IN_RANGE(1, std::numeric_limits<int32_t>::max()));
DEFINE_int32(snapshot_max_retained, -1,
"Number of retained snapshots, -1 means without limit.");
DEFINE_bool(snapshot_on_exit, false, "Snapshot on exiting the database.");
// Misc flags
DEFINE_int32(query_execution_time_sec, 180,
"Maximum allowed query execution time. Queries exceeding this "
"limit will be aborted. Value of -1 means no limit.");
DEFINE_int32(gc_cycle_sec, 30,
"Amount of time between starts of two cleaning cycles in seconds. "
"-1 to turn off.");
// Data location.
DEFINE_string(properties_on_disk, "",
"Property names of properties which will be stored on available "
"disk. Property names have to be separated with comma (,).");
// Full durability.
DEFINE_bool(synchronous_commit, false,
"Should a transaction end wait for WAL records to be written to "
"disk before the transaction finishes.");
// Distributed master/worker flags.
DEFINE_VALIDATED_HIDDEN_int32(worker_id, 0,
"ID of a worker in a distributed system. Igored "
"in single-node.",
FLAG_IN_RANGE(0, 1 << gid::kWorkerIdSize));
DEFINE_HIDDEN_string(master_host, "",
"For master node indicates the host served on. For worker "
"node indicates the master location.");
master_port, 0,
"For master node the port on which to serve. For "
"worker node indicates the master's port.",
FLAG_IN_RANGE(0, std::numeric_limits<uint16_t>::max()));
DEFINE_HIDDEN_string(worker_host, "",
"For worker node indicates the host served on. For master "
"node this flag is not used.");
worker_port, 0,
"For master node it's unused. For worker node "
"indicates the port on which to serve. If zero (default value), a port is "
"chosen at random. Sent to the master when registring worker node.",
FLAG_IN_RANGE(0, std::numeric_limits<uint16_t>::max()));
std::max(std::thread::hardware_concurrency(), 1U),
"Number of client workers (RPC)",
std::max(std::thread::hardware_concurrency(), 1U),
"Number of server workers (RPC)",
DEFINE_VALIDATED_int32(recovering_cluster_size, 0,
"Number of workers (including master) in the "
"previously snapshooted/wal cluster.",
// TODO (buda): Implement an openCypher query for this, because it makes sense
// to be able to start and stop DGP on the fly.
// The implementation should be straightforward.
DEFINE_bool(dynamic_graph_partitioner_enabled, false,
"If the dynamic graph partitioner should be enabled.");
DEFINE_VALIDATED_uint64(vertex_cache_size, 5000,
"Size of cache used for storing remote vertices",
FLAG_IN_RANGE(1, std::numeric_limits<uint64_t>::max()));
DEFINE_VALIDATED_uint64(edge_cache_size, 5000,
"Size of cache used for storing remote edges",
FLAG_IN_RANGE(1, std::numeric_limits<uint64_t>::max()));
// Durability flags.
: durability_enabled{FLAGS_durability_enabled},
// Misc flags.
// Data location.
properties_on_disk(utils::Split(FLAGS_properties_on_disk, ",")),
// Distributed flags.
edge_cache_size{FLAGS_edge_cache_size} {}

View File

@ -1,909 +0,0 @@
#include "database/distributed/distributed_graph_db.hpp"
#include "distributed/bfs_rpc_clients.hpp"
#include "distributed/bfs_rpc_server.hpp"
#include "distributed/bfs_subcursor.hpp"
#include "distributed/cluster_discovery_master.hpp"
#include "distributed/cluster_discovery_worker.hpp"
#include "distributed/coordination_master.hpp"
#include "distributed/coordination_worker.hpp"
#include "distributed/data_manager.hpp"
#include "distributed/data_rpc_server.hpp"
#include "distributed/durability_rpc_master.hpp"
#include "distributed/durability_rpc_worker.hpp"
#include "distributed/dynamic_worker.hpp"
#include "distributed/index_rpc_messages.hpp"
#include "distributed/index_rpc_server.hpp"
#include "distributed/plan_consumer.hpp"
#include "distributed/plan_dispatcher.hpp"
#include "distributed/produce_rpc_server.hpp"
#include "distributed/pull_rpc_clients.hpp"
#include "distributed/token_sharing_rpc_server.hpp"
#include "distributed/updates_rpc_clients.hpp"
#include "distributed/updates_rpc_server.hpp"
#include "durability/distributed/snapshooter.hpp"
#include "storage/distributed/concurrent_id_mapper.hpp"
#include "storage/distributed/concurrent_id_mapper_master.hpp"
#include "storage/distributed/concurrent_id_mapper_worker.hpp"
#include "storage/distributed/storage_gc_master.hpp"
#include "storage/distributed/storage_gc_worker.hpp"
#include "transactions/distributed/engine_master.hpp"
#include "transactions/distributed/engine_worker.hpp"
#include "utils/file.hpp"
using namespace std::literals::chrono_literals;
namespace database {
namespace {
// GraphDbAccessor implementations
class MasterAccessor final : public GraphDbAccessor {
distributed::Coordination *coordination_;
distributed::PullRpcClients *pull_clients_;
int worker_id_{0};
MasterAccessor(Master *db, distributed::Coordination *coordination,
distributed::PullRpcClients *pull_clients_)
: GraphDbAccessor(*db),
worker_id_(db->WorkerId()) {}
MasterAccessor(Master *db, tx::TransactionId tx_id,
distributed::Coordination *coordination,
distributed::PullRpcClients *pull_clients_)
: GraphDbAccessor(*db, tx_id),
worker_id_(db->WorkerId()) {}
void PostCreateIndex(const LabelPropertyIndex::Key &key) override {
std::optional<std::vector<utils::Future<bool>>> index_rpc_completions;
// Notify all workers to create the index
[&key](int worker_id, communication::rpc::ClientPool &client_pool) {
try {
return true;
} catch (const communication::rpc::RpcFailedException &) {
return false;
if (index_rpc_completions) {
// Wait first, check later - so that every thread finishes and none
// terminates - this can probably be optimized in case we fail early so
// that we notify other workers to stop building indexes
for (auto &index_built : *index_rpc_completions) index_built.wait();
for (auto &index_built : *index_rpc_completions) {
// TODO: `get()` can throw an exception, should we delete the index when
// it throws?
if (!index_built.get()) {
throw IndexCreationOnWorkerException("Index exists on a worker");
void PopulateIndexFromBuildIndex(
const LabelPropertyIndex::Key &key) override {
// Notify all workers to start populating an index if we are the master
// since they don't have to wait anymore
std::optional<std::vector<utils::Future<bool>>> index_rpc_completions;
worker_id_, [this, &key](int worker_id,
communication::rpc::ClientPool &client_pool) {
try {
key.label_, key.property_, transaction_id());
return true;
} catch (const communication::rpc::RpcFailedException &) {
return false;
// Populate our own storage
// Check if all workers successfully built their indexes and after this we
// can set the index as built
if (index_rpc_completions) {
// Wait first, check later - so that every thread finishes and none
// terminates - this can probably be optimized in case we fail early so
// that we notify other workers to stop building indexes
for (auto &index_built : *index_rpc_completions) index_built.wait();
for (auto &index_built : *index_rpc_completions) {
// TODO: `get()` can throw an exception, should we delete the index when
// it throws?
if (!index_built.get()) {
throw IndexCreationOnWorkerException("Index exists on a worker");
// TODO (mferencevic): Move this logic into the transaction engine.
void AdvanceCommand() override {
auto tx_id = transaction_id();
auto futures = pull_clients_->NotifyAllTransactionCommandAdvanced(tx_id);
for (auto &future : futures) future.get();
class WorkerAccessor final : public GraphDbAccessor {
explicit WorkerAccessor(Worker *db)
: GraphDbAccessor(*db) {}
WorkerAccessor(Worker *db, tx::TransactionId tx_id)
: GraphDbAccessor(*db, tx_id) {}
void BuildIndex(storage::Label, storage::Property) override {
// TODO: Rethink BuildIndex API or inheritance. It's rather strange that a
// derived type blocks this functionality.
LOG(FATAL) << "BuildIndex invoked on worker.";
// RecoveryTransactions implementations
class DistributedRecoveryTransactions
: public durability::RecoveryTransactions {
explicit DistributedRecoveryTransactions(GraphDb *db) : db_(db) {}
void Commit(const tx::TransactionId &tx_id) final {
void Apply(const database::StateDelta &delta) final {
virtual GraphDbAccessor *GetAccessor(const tx::TransactionId &tx_id) = 0;
GraphDb *db_;
std::unordered_map<tx::TransactionId, std::unique_ptr<GraphDbAccessor>>
class MasterRecoveryTransactions final
: public DistributedRecoveryTransactions {
explicit MasterRecoveryTransactions(Master *db)
: DistributedRecoveryTransactions(db) {}
void Begin(const tx::TransactionId &tx_id) final {
CHECK(accessors_.find(tx_id) == accessors_.end())
<< "Double transaction start";
accessors_.emplace(tx_id, db_->Access());
void Abort(const tx::TransactionId &tx_id) final {
virtual GraphDbAccessor *GetAccessor(
const tx::TransactionId &tx_id) override {
auto found = accessors_.find(tx_id);
CHECK(found != accessors_.end())
<< "Accessor does not exist for transaction: " << tx_id;
return found->second.get();
class WorkerRecoveryTransactions final
: public DistributedRecoveryTransactions {
explicit WorkerRecoveryTransactions(Worker *db)
: DistributedRecoveryTransactions(db) {}
void Begin(const tx::TransactionId &tx_id) override {
LOG(FATAL) << "Unexpected transaction begin on worker recovery.";
void Abort(const tx::TransactionId &tx_id) override {
LOG(FATAL) << "Unexpected transaction abort on worker recovery.";
GraphDbAccessor *GetAccessor(const tx::TransactionId &tx_id) override {
auto found = accessors_.find(tx_id);
// Currently accessors are created on transaction_begin, but since workers
// don't have a transaction begin, the accessors are not created.
if (found == accessors_.end()) {
std::tie(found, std::ignore) = accessors_.emplace(tx_id, db_->Access());
return found->second.get();
} // namespace
// GraphDb implementations
namespace impl {
template <template <typename TId> class TMapper>
struct TypemapPack {
template <typename... TMapperArgs>
explicit TypemapPack(TMapperArgs ... args)
: label(args...), edge_type(args...), property(args...) {}
// TODO this should also be garbage collected
TMapper<storage::Label> label;
TMapper<storage::EdgeType> edge_type;
TMapper<storage::Property> property;
// Master
class Master {
explicit Master(const Config &config, database::Master *self)
: config_(config), self_(self) {}
Config config_;
std::unique_ptr<Storage> storage_ =
std::make_unique<Storage>(config_.worker_id, config_.properties_on_disk);
durability::WriteAheadLog wal_{
config_.worker_id, config_.durability_directory,
config_.durability_enabled, config_.synchronous_commit};
// TODO: Some things may depend on order of construction/destruction. We also
// have a lot of circular pointers among members. It would be a good idea to
// clean the mess. Also, be careful of virtual calls to `self_` in
// constructors of members.
database::Master *self_{nullptr};
distributed::MasterCoordination coordination_{config_.master_endpoint,
tx::EngineMaster tx_engine_{&coordination_, &wal_};
std::unique_ptr<StorageGcMaster> storage_gc_ =
std::make_unique<StorageGcMaster>(storage_.get(), &tx_engine_,
config_.gc_cycle_sec, &coordination_);
TypemapPack<storage::MasterConcurrentIdMapper> typemap_pack_{&coordination_};
distributed::BfsSubcursorStorage subcursor_storage_{&bfs_subcursor_clients_};
distributed::BfsRpcServer bfs_subcursor_server_{self_, &coordination_,
distributed::BfsRpcClients bfs_subcursor_clients_{
self_, &subcursor_storage_, &coordination_, &data_manager_};
distributed::DurabilityRpcMaster durability_rpc_{&coordination_};
distributed::DataRpcServer data_server_{self_, &coordination_};
distributed::DataRpcClients data_clients_{&coordination_};
distributed::PlanDispatcher plan_dispatcher_{&coordination_};
distributed::PullRpcClients pull_clients_{&coordination_, &data_manager_};
distributed::UpdatesRpcServer updates_server_{self_, &coordination_};
distributed::UpdatesRpcClients updates_clients_{&coordination_};
distributed::DataManager data_manager_{*self_, data_clients_,
distributed::ClusterDiscoveryMaster cluster_discovery_{
&coordination_, config_.durability_directory};
distributed::TokenSharingRpcServer token_sharing_server_{
self_, config_.worker_id, &coordination_};
distributed::DynamicWorkerAddition dynamic_worker_addition_{self_, &coordination_};
} // namespace impl
Master::Master(Config config)
: impl_(std::make_unique<impl::Master>(config, this)) {
// Register all transaction based caches for cleanup.
Master::~Master() {}
std::unique_ptr<GraphDbAccessor> Master::Access() {
return std::make_unique<MasterAccessor>(
this, &impl_->coordination_, &impl_->pull_clients_);
std::unique_ptr<GraphDbAccessor> Master::Access(tx::TransactionId tx_id) {
return std::make_unique<MasterAccessor>(
this, tx_id, &impl_->coordination_, &impl_->pull_clients_);
// Returns a reference to the Storage object owned by the implementation.
Storage &Master::storage() {
  Storage &store = *impl_->storage_;
  return store;
}
// Returns a reference to the write-ahead log member of the implementation.
durability::WriteAheadLog &Master::wal() {
  durability::WriteAheadLog &log = impl_->wal_;
  return log;
}
// Returns the implementation's transaction engine through the tx::Engine
// interface.
tx::Engine &Master::tx_engine() {
  tx::Engine &engine = impl_->tx_engine_;
  return engine;
}
storage::ConcurrentIdMapper<storage::Label> &Master::label_mapper() {
return impl_->typemap_pack_.label;
storage::ConcurrentIdMapper<storage::EdgeType> &Master::edge_type_mapper() {
return impl_->typemap_pack_.edge_type;
storage::ConcurrentIdMapper<storage::Property> &Master::property_mapper() {
return impl_->;
// Forwards the request to the storage garbage collector held by the
// implementation object.
void Master::CollectGarbage() {
  auto &collector = *impl_->storage_gc_;
  collector.CollectGarbage();
}
// Returns the worker id stored in this instance's configuration.
int Master::WorkerId() const {
  const auto &cfg = impl_->config_;
  return cfg.worker_id;
}
std::vector<int> Master::GetWorkerIds() const {
return impl_->coordination_.GetWorkerIds();
// Makes a local snapshot and forces the workers to do the same. Snapshot is
// written here only if workers successfully created their own snapshot
bool Master::MakeSnapshot(GraphDbAccessor &accessor) {
auto workers_snapshot =
if (!workers_snapshot.get()) return false;
// This can be further optimized by creating master snapshot at the same
// time as workers snapshots but this forces us to delete the master
// snapshot if we succeed in creating it and workers somehow fail. Because
// we have an assumption that every snapshot that exists on master with
// some tx_id visibility also exists on workers
const bool status =
durability::MakeSnapshot(*this, accessor, impl_->config_.worker_id,
if (status) {
LOG(INFO) << "Snapshot created successfully.";
} else {
LOG(ERROR) << "Snapshot creation failed!";
return status;
void Master::ReinitializeStorage() {
impl_->storage_ = std::make_unique<Storage>(
impl_->config_.worker_id, impl_->config_.properties_on_disk);
impl_->storage_gc_->Reinitialize(impl_->storage_.get(), &impl_->tx_engine_);
io::network::Endpoint Master::endpoint() const {
return impl_->coordination_.GetServerEndpoint();
io::network::Endpoint Master::GetEndpoint(int worker_id) {
return impl_->coordination_.GetEndpoint(worker_id);
void Master::Start() {
// Start coordination.
CHECK(impl_->coordination_.Start()) << "Couldn't start master coordination!";
// Start transactional cache cleanup.
if (impl_->config_.durability_enabled)
// Durability recovery.
// What we recover.
std::optional<durability::RecoveryInfo> recovery_info;
durability::RecoveryData recovery_data;
// Recover only if necessary.
if (impl_->config_.db_recover_on_startup) {
<< "Contents of durability directory are not compatible with the "
"current version of Memgraph binary!";
recovery_info = durability::RecoverOnlySnapshot(
impl_->config_.durability_directory, this, &recovery_data,
std::nullopt, impl_->config_.worker_id);
// Post-recovery setup and checking.
recovery_info ? std::make_optional(
: std::nullopt);
// Wait till workers report back their recoverable wal txs
if (recovery_info) {
CHECK(impl_->config_.recovering_cluster_size > 0)
<< "Invalid cluster recovery size flag. Recovered cluster size "
"should be at least 1";
while (impl_->coordination_.CountRecoveredWorkers() !=
impl_->config_.recovering_cluster_size - 1) {
LOG(INFO) << "Waiting for workers to finish recovering..";
// Get the intersection of recoverable transactions from wal on
// workers and on master
recovery_data.wal_tx_to_recover =
MasterRecoveryTransactions recovery_transactions(this);
durability::RecoverWal(impl_->config_.durability_directory, this,
&recovery_data, &recovery_transactions);
durability::RecoverIndexes(this, recovery_data.indexes);
auto workers_recovered_wal =
// Start the dynamic graph partitioner inside token sharing server
if (impl_->config_.dynamic_graph_partitioner_enabled) {
if (impl_->config_.durability_enabled) {
// move any existing snapshots or wal files to a deprecated folder.
if (!impl_->config_.db_recover_on_startup &&
impl_->config_.durability_directory)) {
LOG(WARNING) << "Since Memgraph was not supposed to recover on startup "
"and durability is enabled, your current durability "
"files will likely be overriden. To prevent important "
"data loss, Memgraph has stored those files into a "
".backup directory inside durability directory";
snapshot_creator_ = std::make_unique<utils::Scheduler>();
"Snapshot", std::chrono::seconds(impl_->config_.snapshot_cycle_sec),
[this] {
auto dba = this->Access();
// Start transaction killer.
if (impl_->config_.query_execution_time_sec != -1) {
"TX killer",
1, std::min(5, impl_->config_.query_execution_time_sec / 4))),
[this]() {
[this](tx::Transaction &t) {
if (t.creation_time() +
impl_->config_.query_execution_time_sec) <
std::chrono::steady_clock::now()) {
bool Master::AwaitShutdown(std::function<void(void)> call_before_shutdown) {
bool ret =
[this, &call_before_shutdown](bool is_cluster_alive) -> bool {
snapshot_creator_ = nullptr;
// Stop all running transactions. This will allow all shutdowns in
// the callback that depend on query execution to be aborted and
// cleaned up.
// TODO (mferencevic): When we have full cluster management
// (detection of failure and automatic failure recovery) this should
// be done directly through the transaction engine (e.g. using
// cluster degraded/operational hooks and callbacks).
is_accepting_transactions_ = false;
[](auto &t) { t.set_should_abort(); });
// Call the toplevel callback to stop everything that the caller
// wants us to stop.
// Now we stop everything that calls RPCs (garbage collection, etc.)
// Stop the storage garbage collector.
// Transactional cache cleanup must be stopped before all of the
// objects that were registered for cleanup are destructed.
// We are not a worker, so we can do a snapshot on exit if it's
// enabled. Doing this on the master forces workers to do the same
// through RPCs. If the cluster is in a degraded state then don't
// attempt to do a snapshot because the snapshot can't be created on
// all workers. The cluster will have to recover from a previous
// snapshot and WALs.
if (impl_->config_.snapshot_on_exit) {
if (is_cluster_alive) {
auto dba = Access();
// Here we make the snapshot and return the snapshot creation
// success to the caller.
return MakeSnapshot(*dba);
} else {
<< "Because the cluster is in a degraded state we can't "
"create a snapshot. The cluster will be recovered from "
"previous snapshots and WALs.";
// The shutdown was completed successfully.
return true;
// Return the shutdown success status.
return ret;
// Delegates shutdown to the coordination member of the implementation.
void Master::Shutdown() {
  impl_->coordination_.Shutdown();
}
distributed::BfsRpcClients &Master::bfs_subcursor_clients() {
return impl_->bfs_subcursor_clients_;
distributed::DataRpcClients &Master::data_clients() {
return impl_->data_clients_;
distributed::UpdatesRpcServer &Master::updates_server() {
return impl_->updates_server_;
distributed::UpdatesRpcClients &Master::updates_clients() {
return impl_->updates_clients_;
distributed::DataManager &Master::data_manager() {
return impl_->data_manager_;
distributed::PullRpcClients &Master::pull_clients() {
return impl_->pull_clients_;
distributed::PlanDispatcher &Master::plan_dispatcher() {
return impl_->plan_dispatcher_;
VertexAccessor InsertVertexIntoRemote(
GraphDbAccessor *dba, int worker_id,
const std::vector<storage::Label> &labels,
const std::unordered_map<storage::Property, PropertyValue> &properties,
std::optional<int64_t> cypher_id) {
auto *db = &dba->db();
CHECK(worker_id != db->WorkerId())
<< "Not allowed to call InsertVertexIntoRemote for local worker";
auto *updates_clients = &db->updates_clients();
auto *data_manager = &db->data_manager();
CHECK(updates_clients && data_manager);
auto created_vertex_info = updates_clients->CreateVertex(
worker_id, dba->transaction_id(), labels, properties, cypher_id);
auto vertex = std::make_unique<Vertex>();
vertex->labels_ = labels;
for (auto &kv : properties) vertex->properties_.set(kv.first, kv.second);
dba->transaction_id(), created_vertex_info.gid,
nullptr, std::move(vertex)));
return VertexAccessor({created_vertex_info.gid, worker_id}, *dba);
// Worker
namespace impl {
class Worker {
Config config_;
std::unique_ptr<Storage> storage_ =
std::make_unique<Storage>(config_.worker_id, config_.properties_on_disk);
durability::WriteAheadLog wal_{
config_.worker_id, config_.durability_directory,
config_.durability_enabled, config_.synchronous_commit};
Worker(const Config &config, database::Worker *self)
: config_(config), self_(self) {}
// TODO: Some things may depend on order of construction/destruction. We also
// have a lot of circular pointers among members. It would be a good idea to
// clean the mess. Also, be careful of virtual calls to `self_` in
// constructors of members.
database::Worker *self_{nullptr};
distributed::WorkerCoordination coordination_{
config_.worker_endpoint, config_.worker_id, config_.master_endpoint,
config_.rpc_num_server_workers, config_.rpc_num_client_workers};
tx::EngineWorker tx_engine_{&coordination_, &wal_};
std::unique_ptr<StorageGcWorker> storage_gc_ =
storage_.get(), &tx_engine_, config_.gc_cycle_sec,
coordination_.GetClientPool(0), config_.worker_id);
TypemapPack<storage::WorkerConcurrentIdMapper> typemap_pack_{
distributed::BfsSubcursorStorage subcursor_storage_{&bfs_subcursor_clients_};
distributed::BfsRpcServer bfs_subcursor_server_{self_, &coordination_,
distributed::BfsRpcClients bfs_subcursor_clients_{
self_, &subcursor_storage_, &coordination_, &data_manager_};
distributed::DataRpcServer data_server_{self_, &coordination_};
distributed::DataRpcClients data_clients_{&coordination_};
distributed::PlanConsumer plan_consumer_{&coordination_};
distributed::ProduceRpcServer produce_server_{self_, &tx_engine_, &coordination_,
plan_consumer_, &data_manager_};
distributed::IndexRpcServer index_rpc_server_{self_, &coordination_};
distributed::UpdatesRpcServer updates_server_{self_, &coordination_};
distributed::UpdatesRpcClients updates_clients_{&coordination_};
distributed::DataManager data_manager_{*self_, data_clients_,
distributed::DurabilityRpcWorker durability_rpc_{self_, &coordination_};
distributed::ClusterDiscoveryWorker cluster_discovery_{
distributed::TokenSharingRpcServer token_sharing_server_{
self_, config_.worker_id, &coordination_};
distributed::DynamicWorkerRegistration dynamic_worker_registration_{
} // namespace impl
Worker::Worker(Config config)
: impl_(std::make_unique<impl::Worker>(config, this)) {
// Register all transaction based caches for cleanup.
Worker::~Worker() {}
std::unique_ptr<GraphDbAccessor> Worker::Access() {
return std::make_unique<WorkerAccessor>(this);
std::unique_ptr<GraphDbAccessor> Worker::Access(tx::TransactionId tx_id) {
return std::make_unique<WorkerAccessor>(this, tx_id);
// Returns a reference to the Storage object owned by the implementation.
Storage &Worker::storage() {
  Storage &store = *impl_->storage_;
  return store;
}
// Returns a reference to the write-ahead log member of the implementation.
durability::WriteAheadLog &Worker::wal() {
  durability::WriteAheadLog &log = impl_->wal_;
  return log;
}
// Returns the implementation's transaction engine through the tx::Engine
// interface.
tx::Engine &Worker::tx_engine() {
  tx::Engine &engine = impl_->tx_engine_;
  return engine;
}
storage::ConcurrentIdMapper<storage::Label> &Worker::label_mapper() {
return impl_->typemap_pack_.label;
storage::ConcurrentIdMapper<storage::EdgeType> &Worker::edge_type_mapper() {
return impl_->typemap_pack_.edge_type;
storage::ConcurrentIdMapper<storage::Property> &Worker::property_mapper() {
return impl_->;
// Forwards the request to the storage garbage collector held by the
// implementation object.
void Worker::CollectGarbage() {
  auto &collector = *impl_->storage_gc_;
  collector.CollectGarbage();
}
// Returns the worker id stored in this instance's configuration.
int Worker::WorkerId() const {
  const auto &cfg = impl_->config_;
  return cfg.worker_id;
}
std::vector<int> Worker::GetWorkerIds() const {
return impl_->coordination_.GetWorkerIds();
bool Worker::MakeSnapshot(GraphDbAccessor &accessor) {
// Makes a local snapshot from the visibility of accessor
const bool status =
durability::MakeSnapshot(*this, accessor, impl_->config_.worker_id,
if (status) {
LOG(INFO) << "Snapshot created successfully.";
} else {
LOG(ERROR) << "Snapshot creation failed!";
return status;
void Worker::ReinitializeStorage() {
impl_->storage_ = std::make_unique<Storage>(
impl_->config_.worker_id, impl_->config_.properties_on_disk);
impl_->storage_gc_->Reinitialize(impl_->storage_.get(), &impl_->tx_engine_);
void Worker::RecoverWalAndIndexes(durability::RecoveryData *recovery_data) {
WorkerRecoveryTransactions recovery_transactions(this);
durability::RecoverWal(impl_->config_.durability_directory, this,
recovery_data, &recovery_transactions);
durability::RecoverIndexes(this, recovery_data->indexes);
io::network::Endpoint Worker::endpoint() const {
return impl_->coordination_.GetServerEndpoint();
io::network::Endpoint Worker::GetEndpoint(int worker_id) {
return impl_->coordination_.GetEndpoint(worker_id);
// Starts the worker. From the visible text the sequence is:
//   1. start coordination (fatal CHECK on failure),
//   2. when `db_recover_on_startup` is set, recover from the snapshot the
//      cluster discovery says should be recovered; otherwise fetch the
//      indices to create from dynamic worker registration and build them,
//   3. when durability is enabled and no recovery was requested, deal with
//      existing durability files (see the warning message below),
//   4. when `query_execution_time_sec` is not -1, set up the "TX killer".
// NOTE(review): many statements and all closing braces of this function are
// missing in this view, so the exact nesting and the calls made at several
// steps cannot be confirmed from here.
void Worker::Start() {
// Start coordination.
CHECK(impl_->coordination_.Start()) << "Couldn't start worker coordination!";
// Register to the master.
// Start transactional cache cleanup.
if (impl_->config_.durability_enabled)
// Durability recovery. We need to check this flag for workers that are added
// after the "main" cluster recovery.
if (impl_->config_.db_recover_on_startup) {
// What we should recover (version, transaction_id) pair.
auto snapshot_to_recover = impl_->cluster_discovery_.snapshot_to_recover();
// What we recover.
std::optional<durability::RecoveryInfo> recovery_info;
durability::RecoveryData recovery_data;
// Recover only if necessary.
if (snapshot_to_recover) {
// check version consistency.
if (!durability::DistributedVersionConsistency(
LOG(FATAL) << "Memgraph worker failed to recover due to version "
"inconsistency with the master.";
if (!durability::VersionConsistency(impl_->config_.durability_directory))
<< "Contents of durability directory are not compatible with the "
"current version of Memgraph binary!";
recovery_info = durability::RecoverOnlySnapshot(
impl_->config_.durability_directory, this, &recovery_data,
snapshot_to_recover->second, impl_->config_.worker_id);
// Post-recovery setup and checking.
// The recovered snapshot transaction id must match the one requested.
if (snapshot_to_recover &&
(!recovery_info ||
snapshot_to_recover->second != recovery_info->snapshot_tx_id))
LOG(FATAL) << "Memgraph worker failed to recover the database state "
"recovered on the master";
} else {
// Check with master if we're a dynamically added worker and need to update
// our indices.
auto indexes = impl_->dynamic_worker_registration_.GetIndicesToCreate();
if (!indexes.empty()) {
durability::RecoverIndexes(this, indexes);
if (impl_->config_.durability_enabled) {
// move any existing snapshots or wal files to a deprecated folder.
if (!impl_->config_.db_recover_on_startup &&
impl_->config_.durability_directory)) {
LOG(WARNING) << "Since Memgraph was not supposed to recover on startup "
"and durability is enabled, your current durability "
"files will likely be overriden. To prevent important "
"data loss, Memgraph has stored those files into a "
".backup directory inside durability directory";
// Start transaction killer.
// A value of -1 for query_execution_time_sec disables the killer.
if (impl_->config_.query_execution_time_sec != -1) {
"TX killer",
1, std::min(5, impl_->config_.query_execution_time_sec / 4))),
[this]() {
[this](tx::Transaction &t) {
if (t.creation_time() +
impl_->config_.query_execution_time_sec) <
std::chrono::steady_clock::now()) {
// Waits for shutdown through the coordination component and returns the
// status produced by that call. The lambda handed to coordination stops
// accepting new transactions, marks transactions as should-abort and always
// reports success.
// NOTE(review): the statements that invoke `call_before_shutdown` and stop
// the individual components are not visible in this view; only their
// describing comments remain.
bool Worker::AwaitShutdown(std::function<void(void)> call_before_shutdown) {
bool ret = impl_->coordination_.AwaitShutdown(
[this, &call_before_shutdown](bool is_cluster_alive) -> bool {
// Stop all running transactions. This will allow all shutdowns in the
// callback that depend on query execution to be aborted and cleaned up.
// TODO (mferencevic): See the note for this same code for the `Master`.
is_accepting_transactions_ = false;
[](auto &t) { t.set_should_abort(); });
// Call the toplevel callback to stop everything that the caller wants
// us to stop.
// Now we stop everything that calls RPCs (garbage collection, etc.)
// Stop the storage garbage collector.
// Transactional cache cleanup must be stopped before all of the objects
// that were registered for cleanup are destructed.
// The worker shutdown always succeeds.
return true;
// Return the shutdown success status.
return ret;
// Forwards the shutdown request to the coordination component.
void Worker::Shutdown() {
  impl_->coordination_.Shutdown();
}
// Returns the BFS subcursor RPC clients held by the worker implementation.
distributed::BfsRpcClients &Worker::bfs_subcursor_clients() {
return impl_->bfs_subcursor_clients_;
// Returns the data RPC clients held by the worker implementation.
distributed::DataRpcClients &Worker::data_clients() {
return impl_->data_clients_;
// Returns the updates RPC server held by the worker implementation.
distributed::UpdatesRpcServer &Worker::updates_server() {
return impl_->updates_server_;
// Returns the updates RPC clients held by the worker implementation.
distributed::UpdatesRpcClients &Worker::updates_clients() {
return impl_->updates_clients_;
// Returns the data manager held by the worker implementation.
distributed::DataManager &Worker::data_manager() {
return impl_->data_manager_;
// Returns the plan consumer held by the worker implementation.
distributed::PlanConsumer &Worker::plan_consumer() {
return impl_->plan_consumer_;
} // namespace database

View File

@ -1,109 +0,0 @@
/// @file
#pragma once
#include "database/distributed/graph_db.hpp"
#include "durability/distributed/version.hpp"
namespace database {
/// GraphDb implementation named `Master`. Besides the GraphDb interface it
/// exposes its endpoint, lifecycle control (Start / AwaitShutdown / Shutdown)
/// and accessors for pull clients, the plan dispatcher and index RPC clients.
/// The actual state lives behind `impl_`.
/// NOTE(review): access specifiers and the closing `};` are not visible in
/// this view.
class Master final : public GraphDb {
explicit Master(Config config = Config());
std::unique_ptr<GraphDbAccessor> Access() override;
std::unique_ptr<GraphDbAccessor> Access(tx::TransactionId) override;
Storage &storage() override;
durability::WriteAheadLog &wal() override;
tx::Engine &tx_engine() override;
storage::ConcurrentIdMapper<storage::Label> &label_mapper() override;
storage::ConcurrentIdMapper<storage::EdgeType> &edge_type_mapper() override;
storage::ConcurrentIdMapper<storage::Property> &property_mapper() override;
void CollectGarbage() override;
int WorkerId() const override;
std::vector<int> GetWorkerIds() const override;
bool MakeSnapshot(GraphDbAccessor &accessor) override;
void ReinitializeStorage() override;
/** Gets this master's endpoint. */
io::network::Endpoint endpoint() const;
/** Gets the endpoint of the worker with the given id. */
// TODO make const once Coordination::GetEndpoint is const.
io::network::Endpoint GetEndpoint(int worker_id);
void Start();
/// Waits for shutdown; `call_before_shutdown` defaults to a no-op callback.
bool AwaitShutdown(std::function<void(void)> call_before_shutdown = [] {});
void Shutdown();
distributed::BfsRpcClients &bfs_subcursor_clients() override;
distributed::DataRpcClients &data_clients() override;
distributed::UpdatesRpcServer &updates_server() override;
distributed::UpdatesRpcClients &updates_clients() override;
distributed::DataManager &data_manager() override;
distributed::PullRpcClients &pull_clients();
distributed::PlanDispatcher &plan_dispatcher();
distributed::IndexRpcClients &index_rpc_clients();
/// Implementation object holding the master's state.
std::unique_ptr<impl::Master> impl_;
utils::Scheduler transaction_killer_;
std::unique_ptr<utils::Scheduler> snapshot_creator_;
/// GraphDb implementation named `Worker`. Besides the GraphDb interface it
/// exposes WAL/index recovery, its endpoint, lifecycle control (Start /
/// AwaitShutdown / Shutdown) and the plan consumer. The actual state lives
/// behind `impl_`.
/// NOTE(review): access specifiers and the closing `};` are not visible in
/// this view.
class Worker final : public GraphDb {
explicit Worker(Config config = Config());
std::unique_ptr<GraphDbAccessor> Access() override;
std::unique_ptr<GraphDbAccessor> Access(tx::TransactionId) override;
Storage &storage() override;
durability::WriteAheadLog &wal() override;
tx::Engine &tx_engine() override;
storage::ConcurrentIdMapper<storage::Label> &label_mapper() override;
storage::ConcurrentIdMapper<storage::EdgeType> &edge_type_mapper() override;
storage::ConcurrentIdMapper<storage::Property> &property_mapper() override;
void CollectGarbage() override;
int WorkerId() const override;
std::vector<int> GetWorkerIds() const override;
bool MakeSnapshot(GraphDbAccessor &accessor) override;
void ReinitializeStorage() override;
/// Recovers the WAL and then the indexes described by `recovery_data`.
void RecoverWalAndIndexes(durability::RecoveryData *recovery_data);
/** Gets this worker's endpoint. */
io::network::Endpoint endpoint() const;
/** Gets the endpoint of the worker with the given id. */
// TODO make const once Coordination::GetEndpoint is const.
io::network::Endpoint GetEndpoint(int worker_id);
void Start();
/// Waits for shutdown; `call_before_shutdown` defaults to a no-op callback.
bool AwaitShutdown(std::function<void(void)> call_before_shutdown = [] {});
void Shutdown();
distributed::BfsRpcClients &bfs_subcursor_clients() override;
distributed::DataRpcClients &data_clients() override;
distributed::UpdatesRpcServer &updates_server() override;
distributed::UpdatesRpcClients &updates_clients() override;
distributed::DataManager &data_manager() override;
distributed::PlanConsumer &plan_consumer();
/// Implementation object holding the worker's state.
std::unique_ptr<impl::Worker> impl_;
utils::Scheduler transaction_killer_;
/// Creates a new Vertex on the given worker.
/// It is NOT allowed to call this function with this worker's id.
///
/// @param dba accessor through which the operation is performed.
/// @param worker_id id of the worker on which the vertex is created.
/// @param labels presumably the labels given to the new vertex (the
///        definition is not in this view -- confirm).
/// @param properties presumably the properties given to the new vertex
///        (confirm against the definition).
/// @param cypher_id optional cypher id for the new vertex.
/// @return accessor to the created vertex.
VertexAccessor InsertVertexIntoRemote(
GraphDbAccessor *dba, int worker_id,
const std::vector<storage::Label> &labels,
const std::unordered_map<storage::Property, PropertyValue> &properties,
std::optional<int64_t> cypher_id);
} // namespace database

View File

@ -1,146 +0,0 @@
/// @file
#pragma once
#include <atomic>
#include <memory>
#include <vector>
#include "durability/distributed/recovery.hpp"
#include "durability/distributed/wal.hpp"
#include "io/network/endpoint.hpp"
#include "storage/common/types/types.hpp"
#include "storage/distributed/concurrent_id_mapper.hpp"
#include "storage/distributed/storage.hpp"
#include "storage/distributed/storage_gc.hpp"
#include "storage/distributed/vertex_accessor.hpp"
#include "transactions/distributed/engine.hpp"
#include "utils/scheduler.hpp"
// Forward declarations of the distributed components referenced by the
// database interface; only the names are introduced here, their definitions
// are not included by this header.
namespace distributed {
class BfsRpcServer;
class BfsRpcClients;
class DataRpcServer;
class DataRpcClients;
class PlanDispatcher;
class PlanConsumer;
class PullRpcClients;
class ProduceRpcServer;
class UpdatesRpcServer;
class UpdatesRpcClients;
class DataManager;
class IndexRpcClients;
}  // namespace distributed
namespace database {
// Forward declarations of the implementation classes; only the names are
// introduced here.
namespace impl {
class Master;
class Worker;
}  // namespace impl
/// Database configuration. Initialized from flags, but modifiable.
/// NOTE(review): the durability and misc members below have no in-class
/// initializers in the visible text, and the constructor (if any) is not
/// visible in this view -- confirm how they get their initial values.
struct Config {
// Durability flags.
bool durability_enabled;
std::string durability_directory;
bool db_recover_on_startup;
int snapshot_cycle_sec;
int snapshot_max_retained;
int snapshot_on_exit;
bool synchronous_commit;
// Misc flags.
int gc_cycle_sec;
// Query execution time limit in seconds; the worker start-up code treats -1
// as "no transaction killer".
int query_execution_time_sec;
// set of properties which will be stored on disk
std::vector<std::string> properties_on_disk;
// Distributed master/worker flags.
bool dynamic_graph_partitioner_enabled{false};
int rpc_num_client_workers{0};
int rpc_num_server_workers{0};
int worker_id{0};
io::network::Endpoint master_endpoint{"", 0};
io::network::Endpoint worker_endpoint{"", 0};
int recovering_cluster_size{0};
// Sizes of caches that hold remote data
// Default value is same as in config.cpp
size_t vertex_cache_size{5000};
size_t edge_cache_size{5000};
class GraphDbAccessor;
/// An abstract base class providing the interface for a graph database.
/// Always be sure that GraphDb object is destructed before main exits, i. e.
/// GraphDb object shouldn't be part of global/static variable, except if its
/// destructor is explicitly called before main exits. Consider code:
/// GraphDb db; // KeyIndex is created as a part of database::Storage
/// int main() {
/// GraphDbAccessor dba(db);
/// auto v = dba.InsertVertex();
/// v.add_label(dba.Label(
/// "Start")); // New SkipList is created in KeyIndex for LabelIndex.
/// // That SkipList creates SkipListGc which
/// // initialises static Executor object.
/// return 0;
/// }
/// After main exits: 1. Executor is destructed, 2. KeyIndex is destructed.
/// Destructor of KeyIndex calls delete on created SkipLists which destroy
/// SkipListGc that tries to use Executor object that doesn't exist anymore.
/// -> CRASH
class GraphDb {
GraphDb() {}
// Neither copyable nor movable.
GraphDb(const GraphDb &) = delete;
GraphDb(GraphDb &&) = delete;
GraphDb &operator=(const GraphDb &) = delete;
GraphDb &operator=(GraphDb &&) = delete;
virtual ~GraphDb() {}
/// Create a new accessor by starting a new transaction.
virtual std::unique_ptr<GraphDbAccessor> Access() = 0;
/// Create an accessor for a running transaction.
virtual std::unique_ptr<GraphDbAccessor> Access(tx::TransactionId) = 0;
virtual Storage &storage() = 0;
virtual durability::WriteAheadLog &wal() = 0;
virtual tx::Engine &tx_engine() = 0;
virtual storage::ConcurrentIdMapper<storage::Label> &label_mapper() = 0;
virtual storage::ConcurrentIdMapper<storage::EdgeType>
&edge_type_mapper() = 0;
virtual storage::ConcurrentIdMapper<storage::Property> &property_mapper() = 0;
virtual void CollectGarbage() = 0;
/// Makes a snapshot from the visibility of the given accessor
virtual bool MakeSnapshot(GraphDbAccessor &accessor) = 0;
/// Releases the storage object safely and creates a new object.
/// This is needed because of recovery, otherwise we might try to recover into
/// a storage which has already been polluted because of a failed previous
/// recovery
virtual void ReinitializeStorage() = 0;
virtual int WorkerId() const = 0;
virtual std::vector<int> GetWorkerIds() const = 0;
virtual distributed::BfsRpcClients &bfs_subcursor_clients() = 0;
virtual distributed::DataRpcClients &data_clients() = 0;
virtual distributed::UpdatesRpcServer &updates_server() = 0;
virtual distributed::UpdatesRpcClients &updates_clients() = 0;
virtual distributed::DataManager &data_manager() = 0;
/// When this is false, no new transactions should be created.
bool is_accepting_transactions() const { return is_accepting_transactions_; }
/// Backing flag for is_accepting_transactions(); starts out as true.
std::atomic<bool> is_accepting_transactions_{true};
} // namespace database

View File

@ -1,560 +0,0 @@
#include "database/distributed/graph_db_accessor.hpp"
#include <chrono>
#include <thread>
#include <glog/logging.h>
#include "distributed/data_manager.hpp"
#include "distributed/updates_rpc_clients.hpp"
#include "durability/distributed/state_delta.hpp"
#include "storage/distributed/address_types.hpp"
#include "storage/distributed/edge.hpp"
#include "storage/distributed/edge_accessor.hpp"
#include "storage/distributed/vertex.hpp"
#include "storage/distributed/vertex_accessor.hpp"
#include "utils/cast.hpp"
#include "utils/on_scope_exit.hpp"
namespace database {
// Constructs an accessor on `db` and marks it as the starter of its
// transaction (transaction_starter_ = true).
// NOTE(review): the initializer for the transaction member is not visible in
// this view.
GraphDbAccessor::GraphDbAccessor(GraphDb &db)
: db_(db),
transaction_starter_{true} {}
// Constructs an accessor on `db` for the transaction identified by `tx_id`;
// the accessor is not the starter of that transaction
// (transaction_starter_ = false).
// NOTE(review): the initializer that uses `tx_id` is not visible in this
// view.
GraphDbAccessor::GraphDbAccessor(GraphDb &db, tx::TransactionId tx_id)
: db_(db),
transaction_starter_{false} {}
// Destructor. Acts only when this accessor started its transaction and the
// transaction was neither committed nor aborted.
// NOTE(review): the body of the branch is not visible in this view;
// presumably it aborts the transaction -- confirm against the original file.
GraphDbAccessor::~GraphDbAccessor() {
if (transaction_starter_ && !commited_ && !aborted_) {
// Returns the worker id of the underlying database. Note that
// GraphDb::WorkerId() returns `int`, so the value is narrowed to int16_t.
int16_t GraphDbAccessor::worker_id() const {
return db_.WorkerId();
// Returns the data manager of the underlying database.
distributed::DataManager &GraphDbAccessor::data_manager() {
return db_.data_manager();
// Returns the updates RPC clients of the underlying database.
distributed::UpdatesRpcClients &GraphDbAccessor::updates_clients() {
return db_.updates_clients();
// Returns the id of the transaction this accessor operates in.
tx::TransactionId GraphDbAccessor::transaction_id() const {
return transaction_.id_;
// Advances the command of the accessor's transaction. Must not be called
// after the accessor was committed or aborted (debug-checked).
// NOTE(review): the statement performing the advance is not visible in this
// view.
void GraphDbAccessor::AdvanceCommand() {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
// Marks the accessor as committed. Must not be called after a previous
// commit or abort (debug-checked).
// NOTE(review): the statement that commits the underlying transaction is not
// visible in this view.
void GraphDbAccessor::Commit() {
DCHECK(!commited_ && !aborted_) << "Already aborted or commited transaction.";
commited_ = true;
// Marks the accessor as aborted. Must not be called after a previous commit
// or abort (debug-checked).
// NOTE(review): the statement that aborts the underlying transaction is not
// visible in this view.
void GraphDbAccessor::Abort() {
DCHECK(!commited_ && !aborted_) << "Already aborted or commited transaction.";
aborted_ = true;
// Returns whether the accessor's transaction has been flagged to abort.
// Must not be called after commit or abort (debug-checked).
bool GraphDbAccessor::should_abort() const {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
return transaction_.should_abort();
// Exposes the write-ahead log of the underlying database.
durability::WriteAheadLog &GraphDbAccessor::wal() {
  durability::WriteAheadLog &log = db_.wal();
  return log;
}
// Creates a new vertex and returns an accessor to it.
//  - `requested_gid`: optional gid to use for the new vertex.
//  - `cypher_id`: optional cypher id; when absent it is derived from the gid
//    via utils::MemcpyCast<int64_t>.
// Inserting a vertex whose gid already exists is a fatal error (CHECK).
// NOTE(review): the expressions that generate the gid, insert the version
// list into storage and emit the WAL delta are truncated in this view.
VertexAccessor GraphDbAccessor::InsertVertex(
std::optional<gid::Gid> requested_gid, std::optional<int64_t> cypher_id) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
auto gid =;
if (!cypher_id) cypher_id = utils::MemcpyCast<int64_t>(gid);
auto vertex_vlist =
new mvcc::VersionList<Vertex>(transaction_, gid, *cypher_id);
bool success =, vertex_vlist).second;
CHECK(success) << "Attempting to insert a vertex with an existing GID: "
<< gid;
transaction_.id_, vertex_vlist->gid_, vertex_vlist->cypher_id()));
auto va = VertexAccessor(storage::VertexAddress(vertex_vlist), *this);
return va;
// Looks up the vertex with the given gid and returns its accessor, or
// std::nullopt when the record is not visible to this accessor's transaction
// for the requested `current_state`.
std::optional<VertexAccessor> GraphDbAccessor::FindVertexOptional(
gid::Gid gid, bool current_state) {
auto record_accessor = FindVertexRaw(gid);
if (!record_accessor.Visible(transaction(), current_state))
return std::nullopt;
return record_accessor;
// Builds a VertexAccessor for the given gid without any visibility check.
// NOTE(review): the expression that resolves the gid to an address is
// truncated in this view.
VertexAccessor GraphDbAccessor::FindVertexRaw(gid::Gid gid) {
return VertexAccessor(
storage::VertexAddress(<Vertex>(gid)), *this);
// Like FindVertexOptional, but a vertex that cannot be found is a fatal
// error (CHECK) instead of an empty result.
VertexAccessor GraphDbAccessor::FindVertex(gid::Gid gid, bool current_state) {
auto found = FindVertexOptional(gid, current_state);
CHECK(found) << "Unable to find vertex for id: " << gid;
return *found;
// Looks up the edge with the given gid and returns its accessor, or
// std::nullopt when the record is not visible to this accessor's transaction
// for the requested `current_state`.
std::optional<EdgeAccessor> GraphDbAccessor::FindEdgeOptional(
gid::Gid gid, bool current_state) {
auto record_accessor = FindEdgeRaw(gid);
if (!record_accessor.Visible(transaction(), current_state))
return std::nullopt;
return record_accessor;
// Builds an EdgeAccessor for the given gid without any visibility check.
// NOTE(review): the expression that resolves the gid to an address is
// truncated in this view.
EdgeAccessor GraphDbAccessor::FindEdgeRaw(gid::Gid gid) {
return EdgeAccessor(
storage::EdgeAddress(<Edge>(gid)), *this);
// Like FindEdgeOptional, but an edge that cannot be found is a fatal error
// (CHECK) instead of an empty result.
EdgeAccessor GraphDbAccessor::FindEdge(gid::Gid gid, bool current_state) {
auto found = FindEdgeOptional(gid, current_state);
CHECK(found) << "Unable to find edge for id: " << gid;
return *found;
// Builds the (label, property) index. From the visible text the steps are:
// create the index (throwing IndexExistsException when it cannot be
// created), wait for the transactions that were active at that point to
// finish, and then use a fresh accessor whose transaction sees everything
// that happened before the index was created.
// NOTE(review): several statements are truncated or missing in this view
// (bookkeeping of in-progress index builds, the index creation call, the
// wait/sleep inside the loop and the final populate/enable steps), so the
// exact sequence cannot be confirmed from here.
void GraphDbAccessor::BuildIndex(storage::Label label,
storage::Property property) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";;
// on function exit remove the create index transaction from
// build_tx_in_progress
utils::OnScopeExit on_exit_1([this] {
auto removed =
DCHECK(removed) << "Index creation transaction should be inside set";
// Create the index
const LabelPropertyIndex::Key key(label, property);
if ( == false) {
throw IndexExistsException(
"Index is either being created by another transaction or already "
// Call the hook for inherited classes.
// Everything that happens after the line above ended will be added to the
// index automatically, but we still have to add to index everything that
// happened earlier. We have to first wait for every transaction that
// happened before, or a bit later than CreateIndex to end.
auto wait_transactions = transaction_.engine_.GlobalActiveTransactions();
auto active_index_creation_transactions =;
for (auto id : wait_transactions) {
if (active_index_creation_transactions.contains(id)) continue;
while (transaction_.engine_.Info(id).is_active()) {
// Active index creation set could only now start containing that id,
// since that thread could have not written to the set and to avoid
// dead-lock we need to make sure we keep track of that
if (active_index_creation_transactions.contains(id)) continue;
// TODO reconsider this constant, currently rule-of-thumb chosen
// This accessor's transaction surely sees everything that happened before
// CreateIndex.
auto dba = db_.Access();
// Add transaction to the build_tx_in_progress as this transaction doesn't
// change data and shouldn't block other parallel index creations
auto read_transaction_id = dba->transaction().id_;
// on function exit remove the read transaction from build_tx_in_progress
utils::OnScopeExit on_exit_2([read_transaction_id, this] {
auto removed =
DCHECK(removed) << "Index building (read) transaction should be inside set";
// Dropping an index is not supported here: always throws
// utils::NotYetImplemented. Both parameters are unused.
void GraphDbAccessor::DeleteIndex(storage::Label, storage::Property) {
throw utils::NotYetImplemented("Distributed drop index");
// Finalizes the index identified by `key`: records the index build in the
// WAL under this accessor's transaction id and marks the index as built.
// NOTE(review): the WAL emit call and the final statement are truncated in
// this view; the description above follows the remaining comments.
void GraphDbAccessor::EnableIndex(const LabelPropertyIndex::Key &key) {
// Commit transaction as we finished applying method on newest visible
// records. Write that transaction's ID to the WAL as the index has been
// built at this point even if this DBA's transaction aborts for some
// reason.
auto wal_build_index_tx_id = transaction_id();
wal_build_index_tx_id, key.label_, LabelName(key.label_), key.property_,
// After these two operations we are certain that everything is contained in
// the index under the assumption that the original index creation transaction
// contained no vertex/edge insert/update before this method was invoked.;
// Iterates over the vertices with the key's label and handles those whose
// value for the key's property is not Null.
// NOTE(review): the statement taken for Null values and the call that adds
// the vertex to the index are truncated in this view; presumably Null-valued
// vertices are skipped and the rest are added to the index -- confirm.
void GraphDbAccessor::PopulateIndex(const LabelPropertyIndex::Key &key) {
for (auto vertex : Vertices(key.label_, false)) {
if (vertex.PropsAt(key.property_).type() == PropertyValue::Type::Null)
vertex.address().local(), vertex.GetCurrent());
// Updates the indexes affected by adding `label` to the given vertex. Only
// local vertices are accepted (debug-checked).
// NOTE(review): the two index update calls on the last line are truncated in
// this view.
void GraphDbAccessor::UpdateLabelIndices(storage::Label label,
const VertexAccessor &vertex_accessor,
const Vertex *const vertex) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
DCHECK(vertex_accessor.is_local()) << "Only local vertices belong in indexes";
auto *vlist_ptr = vertex_accessor.address().local();, vlist_ptr, vertex);, vlist_ptr, vertex);
// Updates the index affected by a change of `property` on the given vertex.
// Only local vertices are accepted (debug-checked).
// NOTE(review): the index update call is truncated in this view.
void GraphDbAccessor::UpdatePropertyIndex(
storage::Property property, const RecordAccessor<Vertex> &vertex_accessor,
const Vertex *const vertex) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
DCHECK(vertex_accessor.is_local()) << "Only local vertices belong in indexes";
property, vertex_accessor.address().local(), vertex);
// Returns a count of all vertices.
// NOTE(review): the return statement is not visible in this view, so whether
// the count is exact or an estimate cannot be told from here.
int64_t GraphDbAccessor::VerticesCount() const {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
// Returns a count of the vertices carrying `label`.
// NOTE(review): the return statement is not visible in this view.
int64_t GraphDbAccessor::VerticesCount(storage::Label label) const {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
// Returns a count of the vertices in the (label, property) index. The index
// is expected to exist ("Index doesn't exist." check).
// NOTE(review): the check expression and the return statement are truncated
// in this view.
int64_t GraphDbAccessor::VerticesCount(storage::Label label,
storage::Property property) const {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
const LabelPropertyIndex::Key key(label, property);
<< "Index doesn't exist.";
// Returns a count of the vertices in the (label, property) index whose
// property equals `value`, based on the index's PositionAndCount(key, value).
// The index is expected to exist ("Index doesn't exist." check).
// NOTE(review): the check expression and most of the return statement are
// truncated in this view.
int64_t GraphDbAccessor::VerticesCount(storage::Label label,
storage::Property property,
const PropertyValue &value) const {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
const LabelPropertyIndex::Key key(label, property);
<< "Index doesn't exist.";
.label_property_index_.PositionAndCount(key, value)
// Returns a count of the vertices in the (label, property) index whose
// property value lies within the given bounds.
//  - At least one of `lower` / `upper` must be provided (CHECK).
//  - Neither bound may hold a Null value (CHECK).
//  - Only lower bound: size - position(lower), minus the equal-value count
//    when the bound is exclusive; never negative.
//  - Only upper bound: position(upper), plus the equal-value count when the
//    bound is inclusive.
//  - Both: position(upper) - position(lower), adjusted for exclusive lower
//    and inclusive upper bounds; never negative.
// NOTE(review): the index existence check and the PositionAndCount / size
// expressions are truncated in this view.
// NOTE(review): std::max(0l, ...) requires both arguments to have type
// `long`; on platforms where the other operand is not `long` this would not
// compile -- confirm the intended target platforms.
int64_t GraphDbAccessor::VerticesCount(
storage::Label label, storage::Property property,
const std::optional<utils::Bound<PropertyValue>> lower,
const std::optional<utils::Bound<PropertyValue>> upper) const {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
const LabelPropertyIndex::Key key(label, property);
<< "Index doesn't exist.";
CHECK(lower || upper) << "At least one bound must be provided";
CHECK(!lower || lower.value().value().type() != PropertyValue::Type::Null)
<< "Null value is not a valid index bound";
CHECK(!upper || upper.value().value().type() != PropertyValue::Type::Null)
<< "Null value is not a valid index bound";
if (!upper) {
auto lower_pac =
key, lower.value().value());
int64_t size =;
return std::max(0l,
size - lower_pac.first -
(lower.value().IsInclusive() ? 0l : lower_pac.second));
} else if (!lower) {
auto upper_pac =
key, upper.value().value());
return upper.value().IsInclusive() ? upper_pac.first + upper_pac.second
: upper_pac.first;
} else {
auto lower_pac =
key, lower.value().value());
auto upper_pac =
key, upper.value().value());
auto result = upper_pac.first - lower_pac.first;
if (lower.value().IsExclusive()) result -= lower_pac.second;
if (upper.value().IsInclusive()) result += upper_pac.second;
return std::max(0l, result);
// Removes the given vertex.
//  - Local vertex: returns true when it was already expired by this
//    transaction; returns false when `check_empty` is set and the vertex
//    still has edges; otherwise removes it and returns true.
//  - Remote vertex: forwards the removal through the updates RPC clients and
//    returns true, because the outcome is only known once the deferred
//    updates run on the remote worker.
// NOTE(review): the WAL emit call and the tail of the remote RemoveVertex
// call are truncated in this view.
bool GraphDbAccessor::RemoveVertex(VertexAccessor &vertex_accessor,
bool check_empty) {
if (vertex_accessor.is_local()) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
// it's possible the vertex was removed already in this transaction
// due to it getting matched multiple times by some patterns
// we can only delete it once, so check if it's already deleted
if (vertex_accessor.GetCurrent()->is_expired_by(transaction_)) return true;
if (check_empty &&
vertex_accessor.out_degree() + vertex_accessor.in_degree() > 0)
return false;
auto *vlist_ptr = vertex_accessor.address().local();
transaction_.id_, vlist_ptr->gid_, check_empty));
vlist_ptr->remove(vertex_accessor.GetCurrent(), transaction_);
return true;
} else {
auto address = vertex_accessor.address();
updates_clients().RemoveVertex(worker_id(), address.worker_id(),
transaction_id(), address.gid(),
// We can't know if we are going to be able to remove vertex until
// deferred updates on a remote worker are executed
return true;
// Removes the vertex together with its edges: edges are removed first, then
// the vertex itself is removed without the empty-check.
// NOTE(review): the range expression of the first loop is truncated in this
// view; presumably it iterates the incoming edges -- confirm.
void GraphDbAccessor::DetachRemoveVertex(VertexAccessor &vertex_accessor) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
// Note that when we call RemoveEdge we must take care not to delete from the
// collection we are iterating over. This invalidates the iterator in a subtle
// way that does not fail in tests, but is NOT correct.
for (auto edge_accessor :
RemoveEdge(edge_accessor, true, false);
for (auto edge_accessor : vertex_accessor.out())
RemoveEdge(edge_accessor, false, true);
RemoveVertex(vertex_accessor, false);
// Creates an edge of `edge_type` from `from` to `to` and returns an accessor
// to it. The work is split in two: InsertEdgeOnFrom creates the edge and
// updates the `from` side, InsertEdgeOnTo updates the `to` side.
// `requested_gid` and `cypher_id` are optional and passed through.
// NOTE(review): the tail of the returned EdgeAccessor's argument list is not
// visible in this view.
EdgeAccessor GraphDbAccessor::InsertEdge(VertexAccessor &from,
VertexAccessor &to,
storage::EdgeType edge_type,
std::optional<gid::Gid> requested_gid,
std::optional<int64_t> cypher_id) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
auto edge_address =
InsertEdgeOnFrom(&from, &to, edge_type, requested_gid, cypher_id);
InsertEdgeOnTo(&from, &to, edge_type, edge_address);
return EdgeAccessor(edge_address, *this, from.address(), to.address(),
// Creates the edge and registers it on the `from` vertex; returns the
// address of the new edge.
//  - Local `from`: the edge is created via InsertOnlyEdge and added to the
//    out-edges of the updated `from` record.
//  - Remote `from`: the edge is created through the updates RPC clients
//    (CreateEdge) and the returned edge address is used.
// NOTE(review): the WAL emit call, the out-edge emplace arguments and the
// statements following the data lock in the remote branch are truncated in
// this view.
storage::EdgeAddress GraphDbAccessor::InsertEdgeOnFrom(
VertexAccessor *from, VertexAccessor *to,
const storage::EdgeType &edge_type,
const std::optional<gid::Gid> &requested_gid,
const std::optional<int64_t> &cypher_id) {
if (from->is_local()) {
auto edge_accessor = InsertOnlyEdge(from->address(), to->address(),
edge_type, requested_gid, cypher_id);
auto edge_address = edge_accessor.address();
auto from_updated = from->GetNew();
// TODO when preparing WAL for distributed, most likely never use
// `CREATE_EDGE`, but always have it split into 3 parts (edge insertion,
// in/out modification).
transaction_.id_, edge_accessor.gid(), edge_accessor.CypherId(),
from->gid(), to->gid(), edge_type, EdgeTypeName(edge_type)));
from_updated->out_.emplace(>address()), edge_address,
return edge_address;
} else {
auto created_edge_info = updates_clients().CreateEdge(
worker_id(), transaction_id(), *from, *to, edge_type, cypher_id);
auto edge_address = created_edge_info.edge_address;
auto guard = storage::GetDataLock(*from);
db().storage().LocalizedAddressIfPossible(to->address()), edge_address,
transaction_id(), edge_address.gid(),
created_edge_info.cypher_id, nullptr,
std::make_unique<Edge>(from->address(), to->address(), edge_type)));
return edge_address;
// Registers an already created edge on the `to` vertex.
//  - Local `to`: the edge is added to the in-edges of the updated `to`
//    record.
//  - Remote `to`: an RPC is issued only when `from` is local or lives on a
//    different worker than `to`; otherwise the `to` side was already handled
//    when the edge was created on the `from` side.
// NOTE(review): the in-edge emplace arguments, the RPC call name and the
// statement following the data lock are truncated in this view.
void GraphDbAccessor::InsertEdgeOnTo(VertexAccessor *from, VertexAccessor *to,
const storage::EdgeType &edge_type,
const storage::EdgeAddress &edge_address) {
if (to->is_local()) {
// Ensure that the "to" accessor has the latest version (switch new).
// WARNING: Must do that after the above "from->update()" for cases when
// we are creating a cycle and "from" and "to" are the same vlist.
auto *to_updated = to->GetNew();
to_updated->in_.emplace(>address()), edge_address,
} else {
// The RPC call for the `to` side is already handled if `from` is not
// local.
if (from->is_local() ||
from->address().worker_id() != to->address().worker_id()) {
worker_id(), transaction_id(), *from,
db().storage().GlobalizedAddress(edge_address), *to, edge_type);
auto guard = storage::GetDataLock(*to);
edge_address, edge_type);
// Creates only the edge record (the in/out collections of the endpoint
// vertices are not touched here) and returns an accessor to it.
//  - `from` must be a local address (checked; see the message below).
//  - `cypher_id`: when absent it is derived from the gid via
//    utils::MemcpyCast<int64_t>.
// Inserting an edge whose gid already exists is a fatal error (CHECK).
// NOTE(review): the `from` locality check expression, the gid generation,
// the storage insert and the tail of the EdgeAccessor construction are
// truncated in this view.
EdgeAccessor GraphDbAccessor::InsertOnlyEdge(
storage::VertexAddress from, storage::VertexAddress to,
storage::EdgeType edge_type, std::optional<gid::Gid> requested_gid,
std::optional<int64_t> cypher_id) {
<< "`from` address should be local when calling InsertOnlyEdge";
auto gid =;
if (!cypher_id) cypher_id = utils::MemcpyCast<int64_t>(gid);
auto edge_vlist = new mvcc::VersionList<Edge>(transaction_, gid, *cypher_id,
from, to, edge_type);
// We need to insert edge_vlist to edges_ before calling update since update
// can throw and edge_vlist will not be garbage collected if it is not in
// edges_ skiplist.
bool success =, edge_vlist).second;
CHECK(success) << "Attempting to insert an edge with an existing GID: "
<< gid;
auto ea = EdgeAccessor(storage::EdgeAddress(edge_vlist), *this, from, to,
return ea;
// Returns a count of all edges.
// NOTE(review): the return statement is not visible in this view.
int64_t GraphDbAccessor::EdgesCount() const {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
// Removes the given edge.
//  - `remove_out_edge` / `remove_in_edge` select whether the edge is also
//    removed from the out-edges of its `from` vertex and the in-edges of its
//    `to` vertex.
//  - Local edge: does nothing when the edge was already expired by this
//    transaction; otherwise detaches it as requested and removes the record.
//  - Remote edge: the removal is sent to the worker owning the edge (which
//    must also own the `from` vertex, CHECKed); a second RPC is sent only
//    when the `to` vertex lives on a different worker.
// NOTE(review): the in-edge removal statement, the WAL emit call and the
// tail of the RemoveInEdge call are truncated in this view.
void GraphDbAccessor::RemoveEdge(EdgeAccessor &edge, bool remove_out_edge,
bool remove_in_edge) {
if (edge.is_local()) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
// it's possible the edge was removed already in this transaction
// due to it getting matched multiple times by some patterns
// we can only delete it once, so check if it's already deleted
if (edge.GetCurrent()->is_expired_by(transaction_)) return;
if (remove_out_edge) edge.from().RemoveOutEdge(edge.address());
if (remove_in_edge);
edge.address().local()->remove(edge.GetCurrent(), transaction_);
database::StateDelta::RemoveEdge(transaction_.id_, edge.gid()));
} else {
auto edge_addr = edge.GlobalAddress();
auto from_addr = db().storage().GlobalizedAddress(edge.from_addr());
CHECK(edge_addr.worker_id() == from_addr.worker_id())
<< "Edge and it's 'from' vertex not on the same worker";
auto to_addr = db().storage().GlobalizedAddress(edge.to_addr());
updates_clients().RemoveEdge(worker_id(), edge_addr.worker_id(),
transaction_id(), edge_addr.gid(),
from_addr.gid(), to_addr);
// Another RPC is necessary only if the first did not handle vertices on
// both sides.
if (edge_addr.worker_id() != to_addr.worker_id()) {
updates_clients().RemoveInEdge(worker_id(), to_addr.worker_id(),
transaction_id(), to_addr.gid(),
// Maps a label name to its storage::Label id through the database's label
// mapper. Must not be called after commit or abort (debug-checked).
storage::Label GraphDbAccessor::Label(const std::string &label_name) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
return db_.label_mapper().value_to_id(label_name);
// Maps a storage::Label id back to its name through the database's label
// mapper. Must not be called after commit or abort (debug-checked).
const std::string &GraphDbAccessor::LabelName(storage::Label label) const {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
return db_.label_mapper().id_to_value(label);
// Maps an edge type name to its storage::EdgeType id through the database's
// edge type mapper. Must not be called after commit or abort (debug-checked).
storage::EdgeType GraphDbAccessor::EdgeType(const std::string &edge_type_name) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
return db_.edge_type_mapper().value_to_id(edge_type_name);
// Maps a storage::EdgeType id back to its name through the database's edge
// type mapper. Must not be called after commit or abort (debug-checked).
const std::string &GraphDbAccessor::EdgeTypeName(
storage::EdgeType edge_type) const {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
return db_.edge_type_mapper().id_to_value(edge_type);
// Maps a property name to its storage::Property id through the database's
// property mapper. Must not be called after commit or abort (debug-checked).
storage::Property GraphDbAccessor::Property(const std::string &property_name) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
return db_.property_mapper().value_to_id(property_name);
// Maps a storage::Property id back to its name through the database's
// property mapper. Must not be called after commit or abort (debug-checked).
const std::string &GraphDbAccessor::PropertyName(
storage::Property property) const {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
return db_.property_mapper().id_to_value(property);
// Returns a textual description of the existing indexes: one ":<label>"
// entry per label in the first loop and one entry formatted with
// ":{}({})" per label+property key in the second loop.
// NOTE(review): the range expressions of both loops and the tail of the
// fmt::format call are truncated in this view.
std::vector<std::string> GraphDbAccessor::IndexInfo() const {
std::vector<std::string> info;
for (storage::Label label : {
info.emplace_back(":" + LabelName(label));
for (LabelPropertyIndex::Key key : {
info.emplace_back(fmt::format(":{}({})", LabelName(key.label_),
return info;
} // namespace database

View File

@ -1,686 +0,0 @@
/// @file
#pragma once
#include <optional>
#include <string>
#include <vector>
#include <glog/logging.h>
#include <cppitertools/filter.hpp>
#include <cppitertools/imap.hpp>
#include "database/distributed/graph_db.hpp"
#include "storage/common/types/types.hpp"
#include "storage/distributed/address_types.hpp"
#include "storage/distributed/edge_accessor.hpp"
#include "storage/distributed/vertex_accessor.hpp"
#include "transactions/transaction.hpp"
#include "transactions/type.hpp"
#include "utils/bound.hpp"
#include "utils/exceptions.hpp"
// Forward declarations of the distributed components referenced by the
// accessor interface; only the names are introduced here.
namespace distributed {
class DataManager;
class UpdatesRpcClients;
}  // namespace distributed
namespace database {
/** Thrown when inserting in an index with constraint. Inherits the
 * constructors of utils::BasicException. */
class ConstraintViolationException : public utils::BasicException {
using utils::BasicException::BasicException;
/** Thrown when creating an index which already exists. Inherits the
 * constructors of utils::BasicException. */
class IndexExistsException : public utils::BasicException {
using utils::BasicException::BasicException;
/** Thrown when creating an index which already exists.
 * NOTE(review): this description is identical to the one on
 * IndexExistsException; judging by the class name it presumably signals that
 * index creation failed on a worker -- confirm and reword. Inherits the
 * constructors of utils::BasicException. */
class IndexCreationOnWorkerException : public utils::BasicException {
using utils::BasicException::BasicException;
/// Thrown on concurrent index creation when the transaction engine fails to
/// start a new transaction. Inherits the constructors of
/// utils::BasicException.
class TransactionException : public utils::BasicException {
using utils::BasicException::BasicException;
* Base accessor for the database object: exposes functions for operating on the
* database. All the functions in this class should be self-sufficient: for
* example the function for creating a new Vertex should take care of all the
* book-keeping around the creation.
class GraphDbAccessor {
// We need to make friends with this guys since they need to access private
// methods for updating indices.
// TODO: Rethink this, we have too much long-distance friendship complicating
// the code.
friend class ::RecordAccessor<Vertex>;
friend class ::VertexAccessor;
// Construction should only be done through GraphDb::Access function and
// concrete GraphDbAccessor type.
/// Creates a new accessor by starting a new transaction.
explicit GraphDbAccessor(GraphDb &db);
/// Creates an accessor for a running transaction.
GraphDbAccessor(GraphDb &db, tx::TransactionId tx_id);
virtual ~GraphDbAccessor();
GraphDbAccessor(const GraphDbAccessor &other) = delete;
GraphDbAccessor(GraphDbAccessor &&other) = delete;
GraphDbAccessor &operator=(const GraphDbAccessor &other) = delete;
GraphDbAccessor &operator=(GraphDbAccessor &&other) = delete;
int16_t worker_id() const;
distributed::DataManager &data_manager();
distributed::UpdatesRpcClients &updates_clients();
* Creates a new Vertex and returns an accessor to it. If the ID is
* provided, the created Vertex will have that local ID, and the ID counter
* will be increased to it so collisions are avoided. This should only be used
* by durability recovery, normal vertex creation should not provide the ID.
* You should NOT make interleaved recovery and normal DB op calls to this
* function. Doing so will likely mess up the ID generation and crash MG.
* Always perform recovery only once, immediately when the database is
* created, before any transactional ops start.
* @param requested_gid The requested GID. Should only be provided when
* recovering from durability.
* @param cypher_id Take a look under mvcc::VersionList::cypher_id
* @return See above.
VertexAccessor InsertVertex(
std::optional<gid::Gid> requested_gid = std::nullopt,
std::optional<int64_t> cypher_id = std::nullopt);
* Removes the vertex of the given accessor. If the vertex has any outgoing or
* incoming edges, it is not deleted. See `DetachRemoveVertex` if you want to
* remove a vertex regardless of connectivity.
* If the vertex has already been deleted by the current transaction+command,
* this function will not do anything and will return true.
* @param vertex_accessor Accessor to vertex.
* @param check_empty If the vertex should be checked for existing edges
* before deletion.
* @return Whether the vertex was deleted.
bool RemoveVertex(VertexAccessor &vertex_accessor,
bool check_empty = true);
* Removes the vertex of the given accessor along with all its outgoing
* and incoming connections.
* @param vertex_accessor Accessor to a vertex.
void DetachRemoveVertex(VertexAccessor &vertex_accessor);
* Obtains the vertex for the given ID. If there is no vertex for the given
* ID, or it's not visible to this accessor's transaction, nullopt is
* returned.
* @param gid - The GID of the sought vertex.
* @param current_state If true then the graph state for the
* current transaction+command is returned (insertions, updates and
* deletions performed in the current transaction+command are not
* ignored).
std::optional<VertexAccessor> FindVertexOptional(gid::Gid gid,
bool current_state);
* Obtains the vertex accessor for given id without checking if the
* vertex is visible.
VertexAccessor FindVertexRaw(gid::Gid gid);
* Obtains the vertex for the given ID. If there is no vertex for the given
* ID, or it's not visible to this accessor's transaction, MG is crashed
* using a CHECK.
* @param gid - The GID of the sought vertex.
* @param current_state If true then the graph state for the
* current transaction+command is returned (insertions, updates and
* deletions performed in the current transaction+command are not
* ignored).
VertexAccessor FindVertex(gid::Gid gid, bool current_state);
* Returns iterable over accessors to all the vertices in the graph
* visible to the current transaction.
* @param current_state If true then the graph state for the
* current transaction+command is returned (insertions, updates and
* deletions performed in the current transaction+command are not
* ignored).
auto Vertices(bool current_state) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
// wrap version lists into accessors, which will look for visible versions
auto accessors = iter::imap(
[this](auto id_vlist) {
return VertexAccessor(storage::VertexAddress(id_vlist.second), *this);
// filter out the accessors not visible to the current transaction
return iter::filter(
[this, current_state](const VertexAccessor &accessor) {
return accessor.Visible(transaction(), current_state);
* Return VertexAccessors which contain the current label for the current
* transaction visibility.
* @param label - label for which to return VertexAccessors
* @param current_state If true then the graph state for the
* current transaction+command is returned (insertions, updates and
* deletions performed in the current transaction+command are not
* ignored).
* @return iterable collection
auto Vertices(storage::Label label, bool current_state) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
return iter::imap(
[this](auto vlist) {
return VertexAccessor(storage::VertexAddress(vlist), *this);
},, transaction_,
* Return VertexAccessors which contain the current label and property for the
* given transaction visibility.
* @param label - label for which to return VertexAccessors
* @param property - property for which to return VertexAccessors
* @param current_state If true then the graph state for the
* current transaction+command is returned (insertions, updates and
* deletions performed in the current transaction+command are not
* ignored).
* @return iterable collection
auto Vertices(storage::Label label, storage::Property property,
bool current_state) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
LabelPropertyIndex::Key(label, property)))
<< "Label+property index doesn't exist.";
return iter::imap(
[this](auto vlist) {
return VertexAccessor(storage::VertexAddress(vlist), *this);
LabelPropertyIndex::Key(label, property), transaction_,
* Return VertexAccessors which contain the current label + property, and
* those properties are equal to this 'value' for the given transaction
* visibility.
* @param label - label for which to return VertexAccessors
* @param property - property for which to return VertexAccessors
* @param value - property value for which to return VertexAccessors
* @param current_state If true then the graph state for the
* current transaction+command is returned (insertions, updates and
* deletions performed in the current transaction+command are not
* ignored).
* @return iterable collection
auto Vertices(storage::Label label, storage::Property property,
const PropertyValue &value, bool current_state) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
LabelPropertyIndex::Key(label, property)))
<< "Label+property index doesn't exist.";
CHECK(value.type() != PropertyValue::Type::Null)
<< "Can't query index for propery value type null.";
return iter::imap(
[this](auto vlist) {
return VertexAccessor(storage::VertexAddress(vlist), *this);
LabelPropertyIndex::Key(label, property), value, transaction_,
* Return an iterable over VertexAccessors which contain the
* given label and whose property value (for the given property)
* falls within the given (lower, upper) @c Bound.
* The returned iterator will only contain
* vertices/edges whose property value is comparable with the
* given bounds (w.r.t. type). This has implications on Cypher
* query execution semantics which have not been resolved yet.
* At least one of the bounds must be specified. Bounds can't be
* @c PropertyValue::Null. If both bounds are
* specified, their PropertyValue elements must be of comparable
* types.
* @param label - label for which to return VertexAccessors
* @param property - property for which to return VertexAccessors
* @param lower - Lower bound of the interval.
* @param upper - Upper bound of the interval.
* @param value - property value for which to return VertexAccessors
* @param current_state If true then the graph state for the
* current transaction+command is returned (insertions, updates and
* deletions performed in the current transaction+command are not
* ignored).
* @return iterable collection of record accessors that
* satisfy the bounds and are visible to the current transaction.
auto Vertices(storage::Label label, storage::Property property,
const std::optional<utils::Bound<PropertyValue>> lower,
const std::optional<utils::Bound<PropertyValue>> upper,
bool current_state) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
LabelPropertyIndex::Key(label, property)))
<< "Label+property index doesn't exist.";
return iter::imap(
[this](auto vlist) {
return VertexAccessor(storage::VertexAddress(vlist), *this);
LabelPropertyIndex::Key(label, property), lower, upper,
transaction_, current_state));
* Creates a new Edge and returns an accessor to it. If the ID is
* provided, the created Edge will have that ID, and the ID counter will be
* increased to it so collisions are avoided. This should only be used by
* durability recovery, normal edge creation should not provide the ID.
* You should NOT make interleaved recovery and normal DB op calls to this
* function. Doing so will likely mess up the ID generation and crash MG.
* Always perform recovery only once, immediately when the database is
* created, before any transactional ops start.
* @param from The 'from' vertex.
* @param to The 'to' vertex.
* @param type Edge type.
* @param requested_gid The requested GID. Should only be provided when
* recovering from durability.
* @param cypher_id Take a look under mvcc::VersionList::cypher_id
* @return An accessor to the edge.
EdgeAccessor InsertEdge(VertexAccessor &from, VertexAccessor &to,
storage::EdgeType type,
std::optional<gid::Gid> requested_gid = std::nullopt,
std::optional<int64_t> cypher_id = std::nullopt);
* Insert edge into main storage, but don't insert it into from and to
* vertices edge lists.
* @param cypher_id Take a look under mvcc::VersionList::cypher_id
EdgeAccessor InsertOnlyEdge(
storage::VertexAddress from, storage::VertexAddress to,
storage::EdgeType edge_type,
std::optional<gid::Gid> requested_gid = std::nullopt,
std::optional<int64_t> cypher_id = std::nullopt);
* Removes an edge from the graph. Parameters can indicate if the edge should
* be removed from data structures in vertices it connects. When removing an
* edge both arguments should be `true`. `false` is only used when
* detach-deleting a vertex.
* @param edge The accessor to an edge.
* @param remove_out_edge If the edge should be removed from its origin
* side.
* @param remove_in_edge If the edge should be removed from its
* destination side.
void RemoveEdge(EdgeAccessor &edge, bool remove_out_edge = true,
bool remove_in_edge = true);
* Obtains the edge for the given ID. If there is no edge for the given
* ID, or it's not visible to this accessor's transaction, nullopt is
* returned.
* @param gid - The GID of the sought edge.
* @param current_state If true then the graph state for the
* current transaction+command is returned (insertions, updates and
* deletions performed in the current transaction+command are not
* ignored).
std::optional<EdgeAccessor> FindEdgeOptional(gid::Gid gid,
bool current_state);
* Obtains the edge accessor for the given id without checking if the edge
* is visible.
EdgeAccessor FindEdgeRaw(gid::Gid gid);
* Obtains the edge for the given ID. If there is no edge for the given
* ID, or it's not visible to this accessor's transaction, MG is crashed
* using a CHECK.
* @param gid - The GID of the sought edge.
* @param current_state If true then the graph state for the
* current transaction+command is returned (insertions, updates and
* deletions performed in the current transaction+command are not
* ignored).
EdgeAccessor FindEdge(gid::Gid gid, bool current_state);
* Returns iterable over accessors to all the edges in the graph
* visible to the current transaction.
* @param current_state If true then the graph state for the
* current transaction+command is returned (insertions, updates and
* deletions performed in the current transaction+command are not
* ignored).
auto Edges(bool current_state) {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
// wrap version lists into accessors, which will look for visible versions
auto accessors = iter::imap(
[this](auto id_vlist) {
return EdgeAccessor(storage::EdgeAddress(id_vlist.second), *this);
// filter out the accessors not visible to the current transaction
return iter::filter(
[this, current_state](const EdgeAccessor &accessor) {
return accessor.Visible(transaction(), current_state);
* Creates and returns a new accessor that represents the same graph element
* (node / version) as the given `accessor`, but in this `GraphDbAccessor`.
* It is possible that the given `accessor` graph element is not visible in
* this `GraphDbAccessor`'s transaction. If that is the case, a `nullopt` is
* returned.
* The returned accessor does NOT have the same `current_` set as the given
* `accessor`. It has default post-construction `current_` set (`old` if
* available, otherwise `new`).
* @param accessor The [Vertex/Edge]Accessor whose underlying graph element we
* want in this GraphDbAccessor.
* @return See above.
* @tparam TAccessor Either VertexAccessor or EdgeAccessor
template <typename TAccessor>
std::optional<TAccessor> Transfer(const TAccessor &accessor) {
// The accessor already belongs to this GraphDbAccessor: hand back a copy.
if (accessor.db_accessor_ == this) return std::make_optional(accessor);
// Re-create the accessor for the same address, but bound to this
// GraphDbAccessor (and therefore to this accessor's transaction).
TAccessor accessor_in_this(accessor.address(), *this);
// `current_` is only set when some version of the element is available to
// this transaction; otherwise the element is not visible here.
if (accessor_in_this.current_)
return std::make_optional(std::move(accessor_in_this));
return std::nullopt;
* Adds an index for the given (label, property) and populates it with
* existing vertices that belong to it.
* You should never call BuildIndex on a GraphDbAccessor (transaction) on
* which new vertices have been inserted or existing ones updated. Do it
* in a new accessor instead.
* Build index throws if an index for the given (label, property) already
* exists (even if it's being built by a concurrent transaction and is not yet
* ready for use).
* It also throws if there is another index being built concurrently on the
* same database this accessor is for.
* @param label - label to build for
* @param property - property to build for
virtual void BuildIndex(storage::Label label, storage::Property property);
/// Deletes the index responsible for (label, property).
/// At the moment this isn't implemented in distributed.
/// @throws NotYetImplemented
void DeleteIndex(storage::Label, storage::Property);
/// Populates index with vertices containing the key
void PopulateIndex(const LabelPropertyIndex::Key &key);
/// Writes Index (key) creation to wal, marks it as ready for usage
void EnableIndex(const LabelPropertyIndex::Key &key);
* @brief - Returns true if the given label+property index already exists and
* is ready for use.
bool LabelPropertyIndexExists(storage::Label label,
storage::Property property) const {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
LabelPropertyIndex::Key(label, property));
* @brief - Returns vector of keys of label-property indices.
std::vector<LabelPropertyIndex::Key> GetIndicesKeys() {
DCHECK(!commited_ && !aborted_) << "Accessor committed or aborted";
* Return approximate number of all vertices in the database.
* Note that this is always an over-estimate and never an under-estimate.
int64_t VerticesCount() const;
* Return approximate number of all edges in the database.
* Note that this is always an over-estimate and never an under-estimate.
int64_t EdgesCount() const;
* Return approximate number of vertices under indexes with the given label.
* Note that this is always an over-estimate and never an under-estimate.
* @param label - label to check for
* @return number of vertices with the given label
int64_t VerticesCount(storage::Label label) const;
* Return approximate number of vertices under indexes with the given label
* and property. Note that this is always an over-estimate and never an
* under-estimate.
* @param label - label to check for
* @param property - property to check for
* @return number of vertices with the given label, fails if no such
* label+property index exists.
int64_t VerticesCount(storage::Label label, storage::Property property) const;
* Returns approximate number of vertices that have the given label
* and the given value for the given property.
* Assumes that an index for that (label, property) exists.
int64_t VerticesCount(storage::Label label, storage::Property property,
const PropertyValue &value) const;
* Returns approximate number of vertices that have the given label
* and whose value is in the range defined by upper and lower @c Bound.
* At least one bound must be specified. Neither can be
* PropertyValue::Null.
* Assumes that an index for that (label, property) exists.
int64_t VerticesCount(
storage::Label label, storage::Property property,
const std::optional<utils::Bound<PropertyValue>> lower,
const std::optional<utils::Bound<PropertyValue>> upper) const;
* Obtains the Label for the label's name.
* @return See above.
storage::Label Label(const std::string &label_name);
* Obtains the label name (a string) for the given label.
* @param label a Label.
* @return See above.
const std::string &LabelName(storage::Label label) const;
* Obtains the EdgeType for its name.
* @return See above.
storage::EdgeType EdgeType(const std::string &edge_type_name);
* Obtains the edge type name (a string) for the given edge type.
* @param edge_type an EdgeType.
* @return See above.
const std::string &EdgeTypeName(storage::EdgeType edge_type) const;
* Obtains the Property for its name.
* @return See above.
storage::Property Property(const std::string &property_name);
* Obtains the property name (a string) for the given property.
* @param property a Property.
* @return See above.
const std::string &PropertyName(storage::Property property) const;
/** Returns the id of this accessor's transaction */
tx::TransactionId transaction_id() const;
/** Advances transaction's command id by 1. */
virtual void AdvanceCommand();
/** Commit transaction. */
void Commit();
/** Abort transaction. */
void Abort();
/** Return true if transaction is hinted to abort. */
bool should_abort() const;
const tx::Transaction &transaction() const { return transaction_; }
durability::WriteAheadLog &wal();
auto &db() { return db_; }
const auto &db() const { return db_; }
/* Returns a list of index names present in the database. */
std::vector<std::string> IndexInfo() const;
* Insert this vertex into corresponding label and label+property (if it
* exists) index.
* @param label - label with which to insert vertex label record
* @param vertex_accessor - vertex_accessor to insert
* @param vertex - vertex record to insert
void UpdateLabelIndices(storage::Label label,
const VertexAccessor &vertex_accessor,
const Vertex *const vertex);
/** Called in `BuildIndex` after creating an index, but before populating. */
virtual void PostCreateIndex(const LabelPropertyIndex::Key &key) {}
/** Populates the index from a *new* transaction after creating the index. */
virtual void PopulateIndexFromBuildIndex(const LabelPropertyIndex::Key &key) {
* Insert a new edge to `from` vertex and return the address.
* Called from `InsertEdge` as the first step in edge insertion.
* */
storage::EdgeAddress InsertEdgeOnFrom(
VertexAccessor *from, VertexAccessor *to,
const storage::EdgeType &edge_type,
const std::optional<gid::Gid> &requested_gid,
const std::optional<int64_t> &cypher_id);
* Set the newly created edge on `to` vertex.
* Called after `InsertEdgeOnFrom` in `InsertEdge`. The given `edge_address`
* is from the created edge, returned by `InsertEdgeOnFrom`.
void InsertEdgeOnTo(VertexAccessor *from, VertexAccessor *to,
const storage::EdgeType &edge_type,
const storage::EdgeAddress &edge_address);
GraphDb &db_;
tx::Transaction &transaction_;
// Indicates if this db-accessor started the transaction and should Abort it
// upon destruction.
bool transaction_starter_;
bool commited_{false};
bool aborted_{false};
* Insert this vertex into corresponding any label + 'property' index.
* @param property - vertex will be inserted into indexes which contain this
* property
* @param vertex_accessor - vertex accessor to insert
* @param vertex - vertex to insert
void UpdatePropertyIndex(storage::Property property,
const RecordAccessor<Vertex> &vertex_accessor,
const Vertex *const vertex);
} // namespace database

View File

@ -1,9 +0,0 @@
#pragma once
#include "durability/distributed/state_delta.hpp"
#include "storage/distributed/rpc/serialization.hpp"
;; Generate serialization of state-delta
(load "durability/distributed/state_delta.lcp")

View File

@ -7,7 +7,3 @@
#include "database/single_node_ha/graph_db.hpp"
#include "database/distributed/graph_db.hpp"

View File

@ -7,7 +7,3 @@
#include "database/single_node_ha/graph_db_accessor.hpp"
#include "database/distributed/graph_db_accessor.hpp"

View File

@ -1,197 +0,0 @@
#include "bfs_rpc_clients.hpp"
#include "database/distributed/graph_db.hpp"
#include "distributed/bfs_rpc_messages.hpp"
#include "distributed/data_manager.hpp"
namespace distributed {
BfsRpcClients::BfsRpcClients(database::GraphDb *db,
BfsSubcursorStorage *subcursor_storage,
Coordination *coordination,
DataManager *data_manager)
: db_(db),
data_manager_(data_manager) {}
std::unordered_map<int16_t, int64_t> BfsRpcClients::CreateBfsSubcursors(
database::GraphDbAccessor *dba, query::EdgeAtom::Direction direction,
const std::vector<storage::EdgeType> &edge_types,
const query::plan::ExpansionLambda &filter_lambda,
const query::SymbolTable &symbol_table,
const query::EvaluationContext &evaluation_context) {
auto futures = coordination_->ExecuteOnWorkers<std::pair<int16_t, int64_t>>(
db_->WorkerId(), [&](int worker_id, auto &client) {
auto res = client.template Call<CreateBfsSubcursorRpc>(
dba->transaction_id(), direction, edge_types, filter_lambda,
symbol_table, evaluation_context.timestamp,
return std::make_pair(worker_id, res.member);
std::unordered_map<int16_t, int64_t> subcursor_ids;
subcursor_storage_->Create(dba, direction, edge_types, symbol_table,
nullptr, filter_lambda, evaluation_context));
for (auto &future : futures) {
auto got = subcursor_ids.emplace(future.get());
CHECK(got.second) << "CreateBfsSubcursors failed: duplicate worker id";
return subcursor_ids;
void BfsRpcClients::RegisterSubcursors(
const std::unordered_map<int16_t, int64_t> &subcursor_ids) {
auto futures = coordination_->ExecuteOnWorkers<void>(
db_->WorkerId(), [&subcursor_ids](int worker_id, auto &client) {
client.template Call<RegisterSubcursorsRpc>(subcursor_ids);
// Wait and get all of the replies.
for (auto &future : futures) {
if (future.valid()) future.get();
void BfsRpcClients::ResetSubcursors(
const std::unordered_map<int16_t, int64_t> &subcursor_ids) {
auto futures = coordination_->ExecuteOnWorkers<void>(
db_->WorkerId(), [&subcursor_ids](int worker_id, auto &client) {
client.template Call<ResetSubcursorRpc>(;
// Wait and get all of the replies.
for (auto &future : futures) {
if (future.valid()) future.get();
std::optional<VertexAccessor> BfsRpcClients::Pull(
int16_t worker_id, int64_t subcursor_id, database::GraphDbAccessor *dba) {
if (worker_id == db_->WorkerId()) {
return subcursor_storage_->Get(subcursor_id)->Pull();
auto res =
[this, dba](auto *res_reader) {
SubcursorPullRes res;
slk::Load(&res, res_reader, dba, this->data_manager_);
return res;
return res.vertex;
bool BfsRpcClients::ExpandLevel(
const std::unordered_map<int16_t, int64_t> &subcursor_ids) {
auto futures = coordination_->ExecuteOnWorkers<bool>(
db_->WorkerId(), [&subcursor_ids](int worker_id, auto &client) {
auto res =
client.template Call<ExpandLevelRpc>(;
switch (res.result) {
case ExpandResult::SUCCESS:
return true;
case ExpandResult::FAILURE:
return false;
case ExpandResult::LAMBDA_ERROR:
throw query::QueryRuntimeException(
"Expansion condition must evaluate to boolean or null");
bool expanded =
for (auto &future : futures) {
expanded |= future.get();
return expanded;
void BfsRpcClients::SetSource(
const std::unordered_map<int16_t, int64_t> &subcursor_ids,
storage::VertexAddress source_address) {
<< "SetSource should be called with global address";
int worker_id = source_address.worker_id();
if (worker_id == db_->WorkerId()) {
} else {
coordination_->GetClientPool(worker_id)->Call<SetSourceRpc>(, source_address);
bool BfsRpcClients::ExpandToRemoteVertex(
const std::unordered_map<int16_t, int64_t> &subcursor_ids,
EdgeAccessor edge, VertexAccessor vertex) {
<< "ExpandToRemoteVertex should not be called with local vertex";
int worker_id = vertex.address().worker_id();
auto res =
coordination_->GetClientPool(worker_id)->Call<ExpandToRemoteVertexRpc>(, edge.GlobalAddress(),
return res.member;
PathSegment BfsRpcClients::ReconstructPath(
const std::unordered_map<int16_t, int64_t> &subcursor_ids,
storage::VertexAddress vertex, database::GraphDbAccessor *dba) {
int worker_id = vertex.worker_id();
if (worker_id == db_->WorkerId()) {
return subcursor_storage_->Get(
auto res =
[this, dba](auto *res_reader) {
ReconstructPathRes res;
slk::Load(&res, res_reader, dba, this->data_manager_);
return res;
},, vertex);
return PathSegment{res.edges, res.next_vertex, res.next_edge};
PathSegment BfsRpcClients::ReconstructPath(
const std::unordered_map<int16_t, int64_t> &subcursor_ids,
storage::EdgeAddress edge, database::GraphDbAccessor *dba) {
int worker_id = edge.worker_id();
if (worker_id == db_->WorkerId()) {
return subcursor_storage_->Get(
auto res =
[this, dba](auto *res_reader) {
ReconstructPathRes res;
slk::Load(&res, res_reader, dba, this->data_manager_);
return res;
},, edge);
return PathSegment{res.edges, res.next_vertex, res.next_edge};
void BfsRpcClients::PrepareForExpand(
const std::unordered_map<int16_t, int64_t> &subcursor_ids, bool clear,
const std::vector<query::TypedValue> &frame) {
auto futures = coordination_->ExecuteOnWorkers<void>(
[this, clear, &frame, &subcursor_ids](int worker_id, auto &client) {
client.template Call<PrepareForExpandRpc>(, clear, frame, db_->WorkerId());
->PrepareForExpand(clear, frame);
// Wait and get all of the replies.
for (auto &future : futures) {
if (future.valid()) future.get();
} // namespace distributed

View File

@ -1,73 +0,0 @@
/// @file
#pragma once
#include "distributed/bfs_subcursor.hpp"
#include "distributed/coordination.hpp"
#include "transactions/transaction.hpp"
namespace database {
class GraphDb;
namespace distributed {
class DataManager;
/// Along with `BfsRpcServer`, this class is used to expose `BfsSubcursor`
/// interface over the network so that subcursors can communicate during the
/// traversal. It is just a thin wrapper making RPC calls that also takes
/// care of storing remote data into cache upon receipt. Special care is taken
/// to avoid sending local RPCs. Instead, subcursor storage is accessed
/// directly.
class BfsRpcClients {
BfsRpcClients(database::GraphDb *db,
BfsSubcursorStorage *subcursor_storage,
Coordination *coordination, DataManager *data_manager);
std::unordered_map<int16_t, int64_t> CreateBfsSubcursors(
database::GraphDbAccessor *dba, query::EdgeAtom::Direction direction,
const std::vector<storage::EdgeType> &edge_types,
const query::plan::ExpansionLambda &filter_lambda,
const query::SymbolTable &symbol_table,
const query::EvaluationContext &evaluation_context);
void RegisterSubcursors(
const std::unordered_map<int16_t, int64_t> &subcursor_ids);
void ResetSubcursors(
const std::unordered_map<int16_t, int64_t> &subcursor_ids);
std::optional<VertexAccessor> Pull(int16_t worker_id, int64_t subcursor_id,
database::GraphDbAccessor *dba);
bool ExpandLevel(const std::unordered_map<int16_t, int64_t> &subcursor_ids);
void SetSource(const std::unordered_map<int16_t, int64_t> &subcursor_ids,
storage::VertexAddress source_address);
bool ExpandToRemoteVertex(
const std::unordered_map<int16_t, int64_t> &subcursor_ids,
EdgeAccessor edge, VertexAccessor vertex);
PathSegment ReconstructPath(
const std::unordered_map<int16_t, int64_t> &subcursor_ids,
storage::EdgeAddress edge, database::GraphDbAccessor *dba);
PathSegment ReconstructPath(
const std::unordered_map<int16_t, int64_t> &subcursor_ids,
storage::VertexAddress vertex, database::GraphDbAccessor *dba);
void PrepareForExpand(
const std::unordered_map<int16_t, int64_t> &subcursor_ids, bool clear,
const std::vector<query::TypedValue> &frame);
database::GraphDb *db_;
distributed::BfsSubcursorStorage *subcursor_storage_;
distributed::Coordination *coordination_;
distributed::DataManager *data_manager_;
} // namespace distributed

View File

@ -1,170 +0,0 @@
#pragma once
#include <tuple>
#include "communication/rpc/messages.hpp"
#include "distributed/bfs_subcursor.hpp"
#include "query/frontend/semantic/symbol_table.hpp"
#include "query/distributed/plan/ops.hpp"
#include "query/distributed/serialization.hpp"
#include "storage/distributed/rpc/serialization.hpp"
#include "transactions/type.hpp"
(lcp:namespace distributed)
(lcp:define-rpc create-bfs-subcursor
((tx-id "::tx::TransactionId")
(direction "::query::EdgeAtom::Direction")
(edge-types "std::vector<storage::EdgeType>")
(filter-lambda "::query::plan::ExpansionLambda"
:slk-load (lambda (member)
slk::Load(&self->${member}, reader, ast_storage);
(symbol-table "::query::SymbolTable")
(timestamp :int64_t)
(parameters "::query::Parameters"))
(:serialize (:slk :load-args '((ast-storage "::query::AstStorage *")))))
(:response ((member :int64_t))))
(lcp:define-rpc register-subcursors
(:request ((subcursor-ids "std::unordered_map<int16_t, int64_t>")))
(:response ()))
(lcp:define-rpc reset-subcursor
(:request ((subcursor-id :int64_t)))
(:response ()))
(lcp:define-enum expand-result
(success failure lambda-error)
(lcp:define-rpc expand-level
(:request ((member :int64_t)))
(:response ((result "ExpandResult"))))
(lcp:define-rpc subcursor-pull
(:request ((member :int64_t)))
((vertex "std::optional<VertexAccessor>"
:slk-save (lambda (member)
slk::Save(static_cast<bool>(self.${member}), builder);
if (self.${member}) {
slk::Save(*self.${member}, builder,
storage::SendVersions::BOTH, worker_id);
:slk-load (lambda (member)
bool has_value;
slk::Load(&has_value, reader);
if (has_value) {
self->${member} = slk::LoadVertexAccessor(reader, dba, data_manager);
(:serialize (:slk :save-args '((worker-id :int16_t))
:load-args '((dba "::database::GraphDbAccessor *")
(data-manager "::distributed::DataManager *"))))))
(lcp:define-rpc set-source
((subcursor-id :int64_t)
(source "::storage::VertexAddress")))
(:response ()))
(lcp:define-rpc expand-to-remote-vertex
((subcursor-id :int64_t)
(edge "::storage::EdgeAddress")
(vertex "::storage::VertexAddress")))
(:response ((member :bool))))
(lcp:define-rpc reconstruct-path
((subcursor-id :int64_t)
(vertex "std::optional<storage::VertexAddress>")
(edge "std::optional<storage::EdgeAddress>"))
ReconstructPathReq(int64_t subcursor_id, storage::VertexAddress vertex)
: subcursor_id(subcursor_id),
edge(std::nullopt) {}
ReconstructPathReq(int64_t subcursor_id, storage::EdgeAddress edge)
: subcursor_id(subcursor_id),
edge(edge) {}
((edges "std::vector<EdgeAccessor>"
:slk-save (lambda (member)
size_t size = self.${member}.size();
slk::Save(size, builder);
for (const auto &v : self.${member}) {
slk::Save(v, builder, storage::SendVersions::BOTH, worker_id);
:slk-load (lambda (member)
size_t size;
slk::Load(&size, reader);
for (size_t i = 0; i < size; ++i) {
self->${member}.push_back(slk::LoadEdgeAccessor(reader, dba, data_manager));
(next-vertex "std::optional<storage::VertexAddress>")
(next-edge "std::optional<storage::EdgeAddress>"))
(:serialize (:slk :save-args '((worker-id :int16_t))
:load-args '((dba "database::GraphDbAccessor *")
(data-manager "distributed::DataManager *"))))
(:ctor nil)
ReconstructPathRes() {}
const std::vector<EdgeAccessor> &edges,
std::optional<storage::VertexAddress> next_vertex,
std::optional<storage::EdgeAddress> next_edge)
: edges(edges), next_vertex(std::move(next_vertex)), next_edge(std::move(next_edge)) {
CHECK(!static_cast<bool>(next_vertex) || !static_cast<bool>(next_edge))
<< "At most one of `next_vertex` and `next_edge` should be set";
(lcp:define-rpc prepare-for-expand
((subcursor-id :int64_t)
(clear :bool)
(frame "std::vector<query::TypedValue>"
:slk-save (lambda (member)
size_t size = self.${member}.size();
slk::Save(size, builder);
for (const auto &v : self.${member}) {
slk::Save(v, builder, storage::SendVersions::ONLY_OLD, self.worker_id);
:slk-load (lambda (member)
auto *subcursor = subcursor_storage->Get(self->subcursor_id);
size_t size;
slk::Load(&size, reader);
for (size_t i = 0; i < size; ++i) {
slk::Load(&self->${member}[i], reader, subcursor->db_accessor(), data_manager);
(worker-id :int16_t :dont-save t))
(:serialize (:slk :load-args '((subcursor_storage "distributed::BfsSubcursorStorage *")
(data-manager "distributed::DataManager *")))))
(:response ()))
(lcp:pop-namespace) ;; distributed

View File

@ -1,166 +0,0 @@
/// @file
#pragma once
#include <map>
#include <mutex>
#include "distributed/bfs_rpc_messages.hpp"
#include "distributed/bfs_subcursor.hpp"
#include "distributed/coordination.hpp"
namespace distributed {
/// Along with `BfsRpcClients`, this class is used to expose `BfsSubcursor`
/// interface over the network so that subcursors can communicate during the
/// traversal. It is just a thin wrapper forwarding RPC calls to subcursors in
/// subcursor storage.
class BfsRpcServer {
BfsRpcServer(database::GraphDb *db,
distributed::Coordination *coordination,
BfsSubcursorStorage *subcursor_storage)
: db_(db), subcursor_storage_(subcursor_storage) {
coordination->Register<CreateBfsSubcursorRpc>([this](auto *req_reader,
auto *res_builder) {
CreateBfsSubcursorReq req;
auto ast_storage = std::make_unique<query::AstStorage>();
slk::Load(&req, req_reader, ast_storage.get());
database::GraphDbAccessor *dba;
std::lock_guard<std::mutex> guard(lock_);
auto it = db_accessors_.find(req.tx_id);
if (it == db_accessors_.end()) {
it = db_accessors_.emplace(req.tx_id, db_->Access(req.tx_id)).first;
dba = it->second.get();
query::EvaluationContext evaluation_context;
evaluation_context.timestamp = req.timestamp;
evaluation_context.parameters = req.parameters; =
query::NamesToProperties(ast_storage->properties_, dba);
evaluation_context.labels =
query::NamesToLabels(ast_storage->labels_, dba);
auto id = subcursor_storage_->Create(
dba, req.direction, req.edge_types, std::move(req.symbol_table),
std::move(ast_storage), req.filter_lambda, evaluation_context);
CreateBfsSubcursorRes res(id);
slk::Save(res, res_builder);
[this](auto *req_reader, auto *res_builder) {
RegisterSubcursorsReq req;
slk::Load(&req, req_reader);
RegisterSubcursorsRes res;
slk::Save(res, res_builder);
[this](auto *req_reader, auto *res_builder) {
ResetSubcursorReq req;
slk::Load(&req, req_reader);
ResetSubcursorRes res;
slk::Save(res, res_builder);
[this](auto *req_reader, auto *res_builder) {
SetSourceReq req;
slk::Load(&req, req_reader);
SetSourceRes res;
slk::Save(res, res_builder);
[this](auto *req_reader, auto *res_builder) {
ExpandLevelReq req;
slk::Load(&req, req_reader);
auto subcursor = subcursor_storage_->Get(req.member);
ExpandResult result;
try {
result = subcursor->ExpandLevel() ? ExpandResult::SUCCESS
: ExpandResult::FAILURE;
} catch (const query::QueryRuntimeException &) {
result = ExpandResult::LAMBDA_ERROR;
ExpandLevelRes res(result);
slk::Save(res, res_builder);
[this](auto *req_reader, auto *res_builder) {
SubcursorPullReq req;
slk::Load(&req, req_reader);
auto vertex = subcursor_storage_->Get(req.member)->Pull();
SubcursorPullRes res(vertex);
slk::Save(res, res_builder, db_->WorkerId());
[this](auto *req_reader, auto *res_builder) {
ExpandToRemoteVertexReq req;
slk::Load(&req, req_reader);
ExpandToRemoteVertexRes res(
->ExpandToLocalVertex(req.edge, req.vertex));
slk::Save(res, res_builder);
coordination->Register<ReconstructPathRpc>([this](auto *req_reader,
auto *res_builder) {
ReconstructPathReq req;
slk::Load(&req, req_reader);
auto subcursor = subcursor_storage_->Get(req.subcursor_id);
PathSegment result;
if (req.vertex) {
result = subcursor->ReconstructPath(*req.vertex);
} else if (req.edge) {
result = subcursor->ReconstructPath(*req.edge);
} else {
LOG(FATAL) << "`edge` or `vertex` should be set in ReconstructPathReq";
ReconstructPathRes res(result.edges, result.next_vertex,
slk::Save(res, res_builder, db_->WorkerId());
coordination->Register<PrepareForExpandRpc>([this](auto *req_reader,
auto *res_builder) {
PrepareForExpandReq req;
slk::Load(&req, req_reader, subcursor_storage_, &db_->data_manager());
auto *subcursor = subcursor_storage_->Get(req.subcursor_id);
subcursor->PrepareForExpand(req.clear, std::move(req.frame));
PrepareForExpandRes res;
slk::Save(res, res_builder);
void ClearTransactionalCache(tx::TransactionId oldest_active) {
// It is unlikely this will become a performance issue, but if it does, we
// should store database accessors in a lock-free map.
std::lock_guard<std::mutex> guard(lock_);
for (auto it = db_accessors_.begin(); it != db_accessors_.end();) {
if (it->first < oldest_active) {
it = db_accessors_.erase(it);
} else {
database::GraphDb *db_;
std::mutex lock_;
std::map<tx::TransactionId, std::unique_ptr<database::GraphDbAccessor>>
BfsSubcursorStorage *subcursor_storage_;
} // namespace distributed

View File

@ -1,231 +0,0 @@
#include "bfs_subcursor.hpp"
#include <unordered_map>
#include "database/distributed/graph_db.hpp"
#include "distributed/bfs_rpc_clients.hpp"
#include "query/exceptions.hpp"
#include "query/plan/operator.hpp"
#include "storage/distributed/address_types.hpp"
#include "storage/vertex_accessor.hpp"
namespace distributed {
using query::TypedValue;
database::GraphDbAccessor *dba, query::EdgeAtom::Direction direction,
std::vector<storage::EdgeType> edge_types, query::SymbolTable symbol_table,
std::unique_ptr<query::AstStorage> ast_storage,
query::plan::ExpansionLambda filter_lambda,
query::EvaluationContext evaluation_context,
BfsRpcClients *bfs_subcursor_clients)
: bfs_subcursor_clients_(bfs_subcursor_clients),
expression_evaluator_(&frame_, symbol_table_, evaluation_context_, dba_,
tx_id_(dba->transaction_id()) {
void ExpandBfsSubcursor::Reset() {
pull_index_ = 0;
void ExpandBfsSubcursor::SetSource(storage::VertexAddress source_address) {
auto source = VertexAccessor(source_address, *dba_);
processed_.emplace(source, std::nullopt);
void ExpandBfsSubcursor::PrepareForExpand(
bool clear, std::vector<query::TypedValue> frame) {
if (clear) {
} else {
std::swap(to_visit_current_, to_visit_next_);
/// Expands every vertex of the current frontier (`to_visit_current_`) once.
///
/// @return true if at least one expansion from the current frontier succeeded.
bool ExpandBfsSubcursor::ExpandLevel() {
  bool any_expanded = false;
  for (const auto &frontier_entry : to_visit_current_) {
    // No short-circuiting here: every frontier vertex has to be expanded, even
    // after the first successful expansion.
    if (ExpandFromVertex(frontier_entry.second)) any_expanded = true;
  }
  // Restart pulling from the beginning of `to_visit_next_`.
  pull_index_ = 0;
  return any_expanded;
}
std::optional<VertexAccessor> ExpandBfsSubcursor::Pull() {
return pull_index_ < to_visit_next_.size()
? std::make_optional(to_visit_next_[pull_index_++].second)
: std::nullopt;
bool ExpandBfsSubcursor::ExpandToLocalVertex(storage::EdgeAddress edge,
VertexAccessor vertex) {
<< "ExpandToLocalVertex called with remote vertex";
edge = dba_->db().storage().LocalizedAddressIfPossible(edge);
std::lock_guard<std::mutex> lock(mutex_);
auto got = processed_.emplace(vertex, edge);
if (got.second) {
to_visit_next_.emplace_back(edge, vertex);
return got.second;
/// Address-based convenience overload: wraps `vertex` in a `VertexAccessor`
/// bound to this subcursor's database accessor and forwards to the
/// accessor-based overload.
///
/// @param edge address of the edge used to reach `vertex`
/// @param vertex address of the vertex to expand to
/// @return whatever the accessor-based overload returns (true if the vertex
///         was newly visited).
bool ExpandBfsSubcursor::ExpandToLocalVertex(storage::EdgeAddress edge,
                                             storage::VertexAddress vertex) {
  // The accessor is built exactly once; the previous version also created an
  // unused local copy before constructing a second accessor for the call.
  return ExpandToLocalVertex(edge, VertexAccessor(vertex, *dba_));
}
/// Reconstructs the part of the path that ends with the given edge.
///
/// The edge itself and its `from` vertex are required to be local, while its
/// `to` vertex is expected to be remote; reconstruction then continues from
/// the `from` vertex.
///
/// @param edge_address address of the last edge of the segment
/// @return the path segment collected by `ReconstructPathHelper`.
PathSegment ExpandBfsSubcursor::ReconstructPath(
    storage::EdgeAddress edge_address) {
  EdgeAccessor last_edge(edge_address, *dba_);

  // Sanity checks on where the edge and its endpoints are stored.
  CHECK(last_edge.address().is_local())
      << "ReconstructPath called with remote edge";
  DCHECK(last_edge.from_addr().is_local())
      << "`from` vertex should always be local";
  DCHECK(!last_edge.to_addr().is_local())
      << "`to` vertex should be remote when "
         "calling ReconstructPath with edge";

  PathSegment segment;
  ReconstructPathHelper(last_edge.from(), &segment);
  return segment;
}
PathSegment ExpandBfsSubcursor::ReconstructPath(
storage::VertexAddress vertex_addr) {
VertexAccessor vertex(vertex_addr, *dba_);
<< "ReconstructPath called with remote vertex";
PathSegment result;
ReconstructPathHelper(vertex, &result);
return result;
void ExpandBfsSubcursor::ReconstructPathHelper(VertexAccessor vertex,
PathSegment *result) {
auto it = processed_.find(vertex);
CHECK(it != processed_.end())
<< "ReconstructPath called with unvisited vertex";
auto in_edge_address = it->second;
while (in_edge_address) {
// In-edge is stored on another worker. It should be returned to master from
// that worker, and path reconstruction should be continued there.
if (in_edge_address->is_remote()) {
result->next_edge = in_edge_address;
result->edges.emplace_back(*in_edge_address, *dba_);
auto &in_edge = result->edges.back();
auto next_vertex_address =
in_edge.from_is(vertex) ? in_edge.to_addr() : in_edge.from_addr();
// We own the in-edge, but the next vertex on the path is stored on another
// worker.
if (next_vertex_address.is_remote()) {
result->next_vertex = next_vertex_address;
vertex = VertexAccessor(next_vertex_address, *dba_);
in_edge_address = processed_[vertex];
bool ExpandBfsSubcursor::ExpandToVertex(EdgeAccessor edge,
VertexAccessor vertex) {
if (filter_lambda_.expression) {
frame_[filter_lambda_.inner_edge_symbol] = edge;
frame_[filter_lambda_.inner_node_symbol] = vertex;
TypedValue result =
if (!result.IsNull() && !result.IsBool()) {
throw query::QueryRuntimeException(
"Expansion condition must evaluate to boolean or null");
if (result.IsNull() || !result.ValueBool()) return false;
return vertex.is_local() ? ExpandToLocalVertex(edge.address(), vertex)
: bfs_subcursor_clients_->ExpandToRemoteVertex(
subcursor_ids_, edge, vertex);
bool ExpandBfsSubcursor::ExpandFromVertex(VertexAccessor vertex) {
bool expanded = false;
if (direction_ != query::EdgeAtom::Direction::IN) {
for (const EdgeAccessor &edge : vertex.out(&edge_types_))
expanded |= ExpandToVertex(edge,;
if (direction_ != query::EdgeAtom::Direction::OUT) {
for (const EdgeAccessor &edge :
expanded |= ExpandToVertex(edge, edge.from());
return expanded;
BfsSubcursorStorage::BfsSubcursorStorage(BfsRpcClients *bfs_subcursor_clients)
: bfs_subcursor_clients_(bfs_subcursor_clients) {}
/// Creates a new `ExpandBfsSubcursor`, stores it under a freshly allocated ID
/// and returns that ID.
///
/// The ID is taken from `next_subcursor_id_`, which is incremented on every
/// call. The whole operation runs while holding `mutex_`.
///
/// @return ID under which the new subcursor can be retrieved with `Get`.
int64_t BfsSubcursorStorage::Create(
    database::GraphDbAccessor *dba, query::EdgeAtom::Direction direction,
    std::vector<storage::EdgeType> edge_types, query::SymbolTable symbol_table,
    std::unique_ptr<query::AstStorage> ast_storage,
    query::plan::ExpansionLambda filter_lambda,
    query::EvaluationContext evaluation_context) {
  std::lock_guard<std::mutex> guard(mutex_);
  const int64_t new_id = next_subcursor_id_++;

  auto subcursor = std::make_unique<ExpandBfsSubcursor>(
      dba, direction, std::move(edge_types), std::move(symbol_table),
      std::move(ast_storage), filter_lambda, std::move(evaluation_context),
      bfs_subcursor_clients_);

  const bool inserted = storage_.emplace(new_id, std::move(subcursor)).second;
  CHECK(inserted) << "Subcursor with ID " << new_id << " already exists";
  return new_id;
}
/// Looks up the subcursor registered under `subcursor_id` while holding
/// `mutex_`. The lookup is guarded by a CHECK, so an unknown ID is treated as
/// a programming error rather than reported to the caller.
///
/// @return non-owning pointer to the stored subcursor.
ExpandBfsSubcursor *BfsSubcursorStorage::Get(int64_t subcursor_id) {
  std::lock_guard<std::mutex> guard(mutex_);
  const auto found = storage_.find(subcursor_id);
  CHECK(found != storage_.end())
      << "Subcursor with ID " << subcursor_id << " not found";
  return found->second.get();
}
void BfsSubcursorStorage::ClearTransactionalCache(
tx::TransactionId oldest_active) {
// It is unlikely this will become a performance issue, but if it does, we
// should store BFS subcursors in a lock-free map.
std::lock_guard<std::mutex> guard(mutex_);
for (auto it = storage_.begin(); it != storage_.end();) {
if (it->second->tx_id() < oldest_active) {
it = storage_.erase(it);
} else {
} // namespace distributed

View File

@ -1,176 +0,0 @@
/// @file
#pragma once
#include <map>
#include <memory>
#include <unordered_map>
#include "glog/logging.h"
#include "database/distributed/graph_db_accessor.hpp"
#include "query/context.hpp"
#include "query/frontend/semantic/symbol_table.hpp"
#include "query/interpret/eval.hpp"
#include "query/plan/operator.hpp"
namespace database {
class GraphDb;
namespace distributed {
class BfsRpcClients;
/// Path from BFS source to a vertex might span multiple workers. This struct
/// stores information describing segment of a path stored on a worker and
/// information necessary to continue path reconstruction on another worker.
struct PathSegment {
/// Edges of the segment, in the order in which path reconstruction collected
/// them.
std::vector<EdgeAccessor> edges;
/// Set when reconstruction stopped because the next vertex on the path is
/// stored on another worker; holds that vertex's address.
std::optional<storage::VertexAddress> next_vertex;
/// Set when reconstruction stopped because the incoming edge of the current
/// vertex is stored on another worker; holds that edge's address.
std::optional<storage::EdgeAddress> next_edge;
/// Class storing the worker-local state of distributed BFS traversal. For each
/// traversal (uniquely identified by cursor id), there is one instance of this
/// class per worker, and those instances communicate via RPC calls.
class ExpandBfsSubcursor {
ExpandBfsSubcursor(database::GraphDbAccessor *dba,
query::EdgeAtom::Direction direction,
std::vector<storage::EdgeType> edge_types,
query::SymbolTable symbol_table,
std::unique_ptr<query::AstStorage> ast_storage,
query::plan::ExpansionLambda filter_lambda,
query::EvaluationContext evaluation_context,
BfsRpcClients *bfs_subcursor_clients);
// Stores subcursor ids of other workers.
void RegisterSubcursors(std::unordered_map<int16_t, int64_t> subcursor_ids) {
subcursor_ids_ = std::move(subcursor_ids);
/// Sets the source to be used for new expansion.
void SetSource(storage::VertexAddress source_address);
/// Notifies the subcursor that a new expansion should take place.
/// `to_visit_next_` must be moved to `to_visit_current_` synchronously for
/// all subcursors participating in expansion to avoid race condition with
/// `ExpandToRemoteVertex` RPC requests. Also used before setting new source
/// with `clear` set to true, to avoid a race condition similar to one
/// described above.
/// @param clear if set to true, `Reset` will be called instead of moving
/// `to_visit_next_`
/// @param frame frame for evaluation of filter lambda expression
void PrepareForExpand(bool clear, std::vector<query::TypedValue> frame);
/// Expands the BFS frontier once. Returns true if there was a successful
/// expansion.
bool ExpandLevel();
/// Pulls the next vertex in the current BFS frontier, if there is one.
std::optional<VertexAccessor> Pull();
/// Expands to a local vertex, if it wasn't already visited. Returns true if
/// expansion was successful.
bool ExpandToLocalVertex(storage::EdgeAddress edge, VertexAccessor vertex);
bool ExpandToLocalVertex(storage::EdgeAddress edge,
storage::VertexAddress vertex);
/// Reconstruct the part of path ending with given edge, stored on this
/// worker.
PathSegment ReconstructPath(storage::EdgeAddress edge_address);
/// Reconstruct the part of path to given vertex stored on this worker.
PathSegment ReconstructPath(storage::VertexAddress vertex_addr);
database::GraphDbAccessor *db_accessor() { return dba_; }
tx::TransactionId tx_id() { return tx_id_; }
/// Used to reset subcursor state before starting expansion from new source.
void Reset();
/// Expands to a local or remote vertex, returns true if expansion was
/// successful.
bool ExpandToVertex(EdgeAccessor edge, VertexAccessor vertex);
/// Tries to expand to all vertices connected to given one and returns true if
/// any of them was successful.
bool ExpandFromVertex(VertexAccessor vertex);
/// Helper for path reconstruction doing the actual work.
void ReconstructPathHelper(VertexAccessor vertex, PathSegment *result);
BfsRpcClients *bfs_subcursor_clients_{nullptr};
database::GraphDbAccessor *dba_;
/// IDs of subcursors on other workers, used when sending RPCs.
std::unordered_map<int16_t, int64_t> subcursor_ids_;
query::EdgeAtom::Direction direction_;
std::vector<storage::EdgeType> edge_types_;
/// Symbol table and AstStorage for filter lambda evaluation. If subcursor
/// doesn't own the filter lambda expression, `ast_storage_` is set to
/// nullptr.
query::SymbolTable symbol_table_;
std::unique_ptr<query::AstStorage> ast_storage_;
query::plan::ExpansionLambda filter_lambda_;
/// Evaluation context, frame and expression evaluator for evaluation of
/// filter lambda.
query::EvaluationContext evaluation_context_;
query::Frame frame_;
query::ExpressionEvaluator expression_evaluator_;
/// Mutex protecting `to_visit_next_` and `processed_`, because there is a
/// race between expansions done locally using `ExpandToLocalVertex` and
/// incoming `ExpandToRemoteVertex` RPCs.
std::mutex mutex_;
/// List of visited vertices and their incoming edges. Local address is stored
/// for local edges, global address for remote edges.
std::unordered_map<VertexAccessor, std::optional<storage::EdgeAddress>>
/// List of vertices at the current expansion level.
std::vector<std::pair<storage::EdgeAddress, VertexAccessor>>
/// List of unvisited vertices reachable from current expansion level.
std::vector<std::pair<storage::EdgeAddress, VertexAccessor>> to_visit_next_;
/// Index of the vertex from `to_visit_next_` to return on next pull.
size_t pull_index_;
// Transaction ID used for transactional cache clean-up mechanism.
tx::TransactionId tx_id_;
/// Thread-safe storage for BFS subcursors.
class BfsSubcursorStorage {
explicit BfsSubcursorStorage(BfsRpcClients *bfs_subcursor_clients);
int64_t Create(database::GraphDbAccessor *dba,
query::EdgeAtom::Direction direction,
std::vector<storage::EdgeType> edge_types,
query::SymbolTable symbol_table,
std::unique_ptr<query::AstStorage> ast_storage,
query::plan::ExpansionLambda filter_lambda,
query::EvaluationContext evaluation_context);
ExpandBfsSubcursor *Get(int64_t subcursor_id);
void ClearTransactionalCache(tx::TransactionId oldest_active);
BfsRpcClients *bfs_subcursor_clients_{nullptr};
std::mutex mutex_;
std::map<int64_t, std::unique_ptr<ExpandBfsSubcursor>> storage_;
int64_t next_subcursor_id_{0};
} // namespace distributed

View File

@ -1,23 +0,0 @@
/// @file
#pragma once
#include <memory>
namespace distributed {
/// A wrapper for cached vertex/edge from other machines in the distributed
/// system.
///
/// Owns both record versions; either of them may be null.
///
/// @tparam TRecord Vertex or Edge
template <typename TRecord>
struct CachedRecordData {
  /// Takes ownership of both record versions.
  ///
  /// @param cypher_id cypher ID associated with the cached record
  /// @param old_record old version of the record
  /// @param new_record new version of the record
  CachedRecordData(int64_t cypher_id, std::unique_ptr<TRecord> old_record,
                   std::unique_ptr<TRecord> new_record)
      : cypher_id(cypher_id),
        // `old_record` has to be stored as well; without this initializer the
        // member silently stays null and the passed-in record is destroyed.
        old_record(std::move(old_record)),
        new_record(std::move(new_record)) {}

  int64_t cypher_id;
  std::unique_ptr<TRecord> old_record;
  std::unique_ptr<TRecord> new_record;
};
} // namespace distributed

View File

@ -1,87 +0,0 @@
#include "distributed/cluster_discovery_master.hpp"
#include <filesystem>
#include "distributed/coordination_rpc_messages.hpp"
#include "io/network/endpoint.hpp"
#include "utils/file.hpp"
#include "utils/string.hpp"
namespace distributed {
MasterCoordination *coordination, const std::string &durability_directory)
: coordination_(coordination), durability_directory_(durability_directory) {
coordination_->Register<RegisterWorkerRpc>([this](const auto &endpoint,
auto *req_reader,
auto *res_builder) {
bool registration_successful = false;
bool durability_error = false;
RegisterWorkerReq req;
slk::Load(&req, req_reader);
// Compose the worker's endpoint from its connecting address and its
// advertised port.
io::network::Endpoint worker_endpoint(endpoint.address(), req.port);
// Create and find out what is our durability directory.
auto full_durability_directory =
// Check whether the worker is running on the same host (detected when it
// connects to us over the loopback interface) and whether it has the same
// durability directory as us.
// TODO (mferencevic): This check should also be done for all workers in
// between them because this check only verifies that the worker and master
// don't collide, there can still be a collision between workers.
if ((utils::StartsWith(endpoint.address(), "127.") ||
endpoint.address() == "::1") &&
req.durability_directory == full_durability_directory) {
durability_error = true;
<< "The worker at " << worker_endpoint
<< " was started with the same durability directory as the master!";
// Register the worker if the durability check succeeded.
if (!durability_error) {
registration_successful =
coordination_->RegisterWorker(req.desired_worker_id, worker_endpoint);
// Notify the cluster of the new worker if the registration succeeded.
if (registration_successful) {
void>(0, [req, worker_endpoint](
int worker_id,
communication::rpc::ClientPool &client_pool) {
try {
} catch (const communication::rpc::RpcFailedException &) {
<< "Couldn't notify the cluster of the changed configuration!";
RegisterWorkerRes res(registration_successful, durability_error,
slk::Save(res, res_builder);
[this](auto *req_reader, auto *res_builder) {
NotifyWorkerRecoveredReq req;
slk::Load(&req, req_reader);
NotifyWorkerRecoveredRes res;
slk::Save(res, res_builder);
} // namespace distributed

View File

@ -1,24 +0,0 @@
#pragma once
#include "distributed/coordination_master.hpp"
namespace distributed {
using Server = communication::rpc::Server;
/** Handle cluster discovery on master.
* Cluster discovery on master handles worker registration and broadcasts new
* worker information to already registered workers, and already registered
* worker information to the new worker.
class ClusterDiscoveryMaster final {
ClusterDiscoveryMaster(MasterCoordination *coordination,
const std::string &durability_directory);
MasterCoordination *coordination_;
std::string durability_directory_;
} // namespace distributed

View File

@ -1,65 +0,0 @@
#include "distributed/cluster_discovery_worker.hpp"
#include <filesystem>
#include "distributed/coordination_rpc_messages.hpp"
#include "utils/file.hpp"
namespace distributed {
using Server = communication::rpc::Server;
ClusterDiscoveryWorker::ClusterDiscoveryWorker(WorkerCoordination *coordination)
: coordination_(coordination),
client_pool_(coordination->GetClientPool(0)) {
[this](auto *req_reader, auto *res_builder) {
ClusterDiscoveryReq req;
slk::Load(&req, req_reader);
coordination_->RegisterWorker(req.worker_id, req.endpoint);
ClusterDiscoveryRes res;
slk::Save(res, res_builder);
void ClusterDiscoveryWorker::RegisterWorker(
int worker_id, const std::string &durability_directory) {
// Create and find out what is our durability directory.
auto full_durability_directory =
// Register to the master.
try {
auto result = client_pool_->Call<RegisterWorkerRpc>(
worker_id, coordination_->GetServerEndpoint().port(),
<< "This worker was started on the same machine and with the same "
"durability directory as the master! Please change the durability "
"directory for this worker.";
<< "Unable to assign requested ID (" << worker_id << ") to worker!";
worker_id_ = worker_id;
for (auto &kv : result.workers) {
coordination_->RegisterWorker(kv.first, kv.second);
snapshot_to_recover_ = result.snapshot_to_recover;
} catch (const communication::rpc::RpcFailedException &e) {
LOG(FATAL) << "Couldn't register to the master!";
/// Sends a `NotifyWorkerRecoveredRpc` carrying this worker's ID and
/// `recovery_info` through `client_pool_` (the pool obtained for worker 0 in
/// the constructor).
///
/// Requires a prior successful `RegisterWorker` call: `worker_id_` starts out
/// as -1 and is only assigned during registration. A failed RPC is fatal.
///
/// @param recovery_info recovery information forwarded as-is; may be empty.
void ClusterDiscoveryWorker::NotifyWorkerRecovered(
const std::optional<durability::RecoveryInfo> &recovery_info) {
// A negative ID means registration has not happened yet.
CHECK(worker_id_ >= 0)
<< "Workers id is not yet assigned, preform registration before "
"notifying that the recovery finished";
try {
client_pool_->Call<NotifyWorkerRecoveredRpc>(worker_id_, recovery_info);
} catch (const communication::rpc::RpcFailedException &e) {
// There is no retry or fallback: the process is aborted.
LOG(FATAL) << "Couldn't notify the master that we finished recovering!";
} // namespace distributed

View File

@ -1,50 +0,0 @@
#pragma once
#include <optional>
#include "communication/rpc/client_pool.hpp"
#include "communication/rpc/server.hpp"
#include "distributed/coordination_worker.hpp"
#include "durability/distributed/recovery.hpp"
namespace distributed {
/** Handle cluster discovery on worker.
* Cluster discovery on worker handles worker registration by sending an rpc
* request to master and processes received rpc response with other worker
* information.
class ClusterDiscoveryWorker final {
ClusterDiscoveryWorker(WorkerCoordination *coordination);
* Registers a worker with the master.
* @param worker_id - Desired ID. If master can't assign the desired worker
* id, worker will exit.
* @param durability_directory - The durability directory that is used for
* this worker.
void RegisterWorker(int worker_id, const std::string &durability_directory);
* Notifies the master that the worker finished recovering. Assumes that the
* worker was already registered with master.
void NotifyWorkerRecovered(
const std::optional<durability::RecoveryInfo> &recovery_info);
/** Returns the snapshot that should be recovered on workers. Valid only after
* registration. */
auto snapshot_to_recover() const { return snapshot_to_recover_; }
int worker_id_{-1};
distributed::WorkerCoordination *coordination_;
communication::rpc::ClientPool *client_pool_;
std::optional<std::pair<int64_t, tx::TransactionId>> snapshot_to_recover_;
} // namespace distributed

View File

@ -1,93 +0,0 @@
#include "glog/logging.h"
#include <thread>
#include "distributed/coordination.hpp"
namespace distributed {
/// Constructs the RPC server on `worker_endpoint` and the client thread pool.
///
/// @param worker_endpoint endpoint passed to the RPC server of this instance
/// @param worker_id ID of this instance; 0 denotes the master
/// @param master_endpoint endpoint of the master, only recorded when
///        `worker_id` is not 0
/// @param server_workers_count worker count passed to the RPC server
/// @param client_workers_count size of the "RPC client" thread pool
Coordination::Coordination(const io::network::Endpoint &worker_endpoint,
int worker_id,
const io::network::Endpoint &master_endpoint,
int server_workers_count, int client_workers_count)
: server_(worker_endpoint, &server_context_, server_workers_count),
thread_pool_(client_workers_count, "RPC client") {
if (worker_id != 0) {
// The master is always worker 0.
// We only emplace the master endpoint when this instance isn't the
// `MasterCoordination`. This is because we don't know the exact master
// endpoint until the master server is started. The `MasterCoordination`
// will emplace the master endpoint when the server is started. Eg. if an
// address that still has to be resolved is supplied as the master endpoint
// (NOTE(review): the literal example address is missing from this comment,
// presumably a wildcard/ephemeral one -- confirm), it should be first
// resolved by the server when it binds to that address and
// `server_.endpoint()` should be used.
workers_.emplace(0, master_endpoint);
Coordination::~Coordination() {}
/// Returns a copy of the endpoint registered for `worker_id`.
///
/// The lookup happens under `lock_`; asking for an ID that was never
/// registered trips the CHECK below.
io::network::Endpoint Coordination::GetEndpoint(int worker_id) {
  std::lock_guard<std::mutex> guard(lock_);
  const auto entry = workers_.find(worker_id);
  // TODO (mferencevic): Handle this error situation differently.
  CHECK(entry != workers_.end())
      << "No endpoint registered for worker id: " << worker_id;
  return entry->second;
}
/// Returns the endpoint reported by this instance's own RPC server
/// (`server_.endpoint()`), not an entry from the worker map.
io::network::Endpoint Coordination::GetServerEndpoint() {
return server_.endpoint();
std::vector<int> Coordination::GetWorkerIds() {
std::lock_guard<std::mutex> guard(lock_);
std::vector<int> worker_ids;
for (auto worker : workers_) worker_ids.push_back(worker.first);
return worker_ids;
/// Returns a copy of the whole worker-id -> endpoint map, made while holding
/// `lock_`, so the caller gets a snapshot that later registrations don't
/// affect.
std::unordered_map<int, io::network::Endpoint> Coordination::GetWorkers() {
std::lock_guard<std::mutex> guard(lock_);
return workers_;
communication::rpc::ClientPool *Coordination::GetClientPool(int worker_id) {
std::lock_guard<std::mutex> guard(lock_);
auto found = client_pools_.find(worker_id);
if (found != client_pools_.end()) return &found->second;
auto found_endpoint = workers_.find(worker_id);
// TODO (mferencevic): Handle this error situation differently.
CHECK(found_endpoint != workers_.end())
<< "No endpoint registered for worker id: " << worker_id;
auto &endpoint = found_endpoint->second;
return &client_pools_
std::forward_as_tuple(endpoint, &client_context_))
/// Records `endpoint` for `worker_id` in `workers_` while holding `lock_`.
///
/// NOTE(review): `std::unordered_map::insert` leaves an already present key
/// untouched, so calling this for an existing `worker_id` keeps the old
/// endpoint. The declaration's comment says the call can be used "to replace
/// an existing worker" -- confirm which behavior is intended before relying
/// on replacement.
void Coordination::AddWorker(int worker_id,
const io::network::Endpoint &endpoint) {
std::lock_guard<std::mutex> guard(lock_);
workers_.insert({worker_id, endpoint});
/// Builds a human readable name for the worker registered at `endpoint`.
///
/// @return "master (<endpoint>)" for worker 0, "worker <id> (<endpoint>)" for
///         any other registered worker and "unknown worker (<endpoint>)" when
///         no registered worker uses that endpoint.
std::string Coordination::GetWorkerName(const io::network::Endpoint &endpoint) {
  std::lock_guard<std::mutex> guard(lock_);
  for (const auto &[id, registered_endpoint] : workers_) {
    if (!(registered_endpoint == endpoint)) continue;
    if (id == 0) return fmt::format("master ({})", registered_endpoint);
    return fmt::format("worker {} ({})", id, registered_endpoint);
  }
  return fmt::format("unknown worker ({})", endpoint);
}
/// Returns the current value of the atomic `cluster_alive_` flag.
bool Coordination::IsClusterAlive() { return cluster_alive_; }
} // namespace distributed

View File

@ -1,114 +0,0 @@
#pragma once
#include <functional>
#include <mutex>
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <vector>
#include "communication/rpc/client_pool.hpp"
#include "communication/rpc/server.hpp"
#include "io/network/endpoint.hpp"
#include "utils/future.hpp"
#include "utils/thread.hpp"
namespace distributed {
/// Coordination base class. This class is thread safe.
class Coordination {
Coordination(const io::network::Endpoint &worker_endpoint, int worker_id,
const io::network::Endpoint &master_endpoint,
int server_workers_count = std::thread::hardware_concurrency(),
int client_workers_count = std::thread::hardware_concurrency());
/// Gets the endpoint for the given worker ID from the master.
io::network::Endpoint GetEndpoint(int worker_id);
/// Gets the endpoint for this RPC server.
io::network::Endpoint GetServerEndpoint();
/// Returns the IDs of all workers, including the master (ID 0).
std::vector<int> GetWorkerIds();
/// Gets the mapping of worker id to worker endpoint including master (ID 0).
std::unordered_map<int, io::network::Endpoint> GetWorkers();
/// Returns a cached `ClientPool` for the given `worker_id`.
communication::rpc::ClientPool *GetClientPool(int worker_id);
/// Asynchronously executes the given function on the rpc client for the
/// given worker id. Returns an `utils::Future` of the given `execute`
/// function's return type.
///
/// @tparam TResult return type of `execute`
/// @param worker_id ID of the worker whose client pool is handed to `execute`
/// @param execute function run on `thread_pool_`; it receives `worker_id` and
///        a reference to the worker's `ClientPool`
template <typename TResult>
auto ExecuteOnWorker(
int worker_id,
std::function<TResult(int worker_id, communication::rpc::ClientPool &)>
execute) {
// TODO (mferencevic): Change this lambda to accept a pointer to
// `ClientPool` instead of a reference!
auto client_pool = GetClientPool(worker_id);
// `std::ref` makes the thread pool pass the pool by reference instead of
// trying to copy it.
return thread_pool_.Run(execute, worker_id, std::ref(*client_pool));
/// Asynchroniously executes the `execute` function on all worker rpc clients
/// except the one whose id is `skip_worker_id`. Returns a vector of futures
/// contaning the results of the `execute` function.
template <typename TResult>
auto ExecuteOnWorkers(
int skip_worker_id,
std::function<TResult(int worker_id, communication::rpc::ClientPool &)>
execute) {
std::vector<utils::Future<TResult>> futures;
// TODO (mferencevic): GetWorkerIds always copies the vector of workers,
// this may be an issue...
for (auto &worker_id : GetWorkerIds()) {
if (worker_id == skip_worker_id) continue;
futures.emplace_back(std::move(ExecuteOnWorker(worker_id, execute)));
return futures;
template <class TRequestResponse>
void Register(std::function<void(slk::Reader *, slk::Builder *)> callback) {
template <class TRequestResponse>
void Register(std::function<void(const io::network::Endpoint &, slk::Reader *,
slk::Builder *)>
callback) {
/// Returns `true` if the cluster is in a consistent state.
bool IsClusterAlive();
/// Adds a worker to the coordination. This function can be called multiple
/// times to replace an existing worker.
void AddWorker(int worker_id, const io::network::Endpoint &endpoint);
/// Gets a worker name for the given endpoint.
std::string GetWorkerName(const io::network::Endpoint &endpoint);
// TODO(mferencevic): distributed is currently hardcoded not to use SSL
communication::ServerContext server_context_;
communication::rpc::Server server_;
std::atomic<bool> cluster_alive_{true};
std::unordered_map<int, io::network::Endpoint> workers_;
mutable std::mutex lock_;
// TODO(mferencevic): distributed is currently hardcoded not to use SSL
communication::ClientContext client_context_;
std::unordered_map<int, communication::rpc::ClientPool> client_pools_;
utils::ThreadPool thread_pool_;
} // namespace distributed

View File

@ -1,233 +0,0 @@
#include <algorithm>
#include <chrono>
#include <thread>
#include "glog/logging.h"
#include "communication/rpc/client.hpp"
#include "distributed/coordination_master.hpp"
#include "distributed/coordination_rpc_messages.hpp"
#include "io/network/utils.hpp"
#include "utils/string.hpp"
namespace distributed {
// Send a heartbeat request to the workers every `kHeartbeatIntervalSeconds`.
// This constant must be at least 10x smaller than `kHeartbeatMaxDelaySeconds`
// that is defined in the worker coordination.
const int kHeartbeatIntervalSeconds = 1;
/// Constructs the master's coordination: the base class is set up with
/// `master_endpoint` as the server endpoint and worker ID 0. The base's
/// `master_endpoint` argument is passed as a default-constructed endpoint
/// (`{}`) because the base constructor only records it for non-zero worker
/// IDs.
MasterCoordination::MasterCoordination(const Endpoint &master_endpoint,
int server_workers_count,
int client_workers_count)
: Coordination(master_endpoint, 0, {}, server_workers_count,
client_workers_count) {}
MasterCoordination::~MasterCoordination() {
CHECK(!alive_) << "You must call Shutdown and AwaitShutdown on "
bool MasterCoordination::RegisterWorker(int desired_worker_id,
Endpoint endpoint) {
// Workers can't register before the recovery phase on the master is done, to
// ensure the whole cluster is in a consistent state.
while (true) {
std::lock_guard<std::mutex> guard(master_lock_);
if (recovery_done_) break;
std::lock_guard<std::mutex> guard(master_lock_);
auto workers = GetWorkers();
// Check if the desired worker id already exists.
if (workers.find(desired_worker_id) != workers.end()) {
LOG(WARNING) << "Unable to assign requested ID (" << desired_worker_id
<< ") to worker at: " << endpoint;
// If the desired worker ID is already assigned, return -1 and don't add
// that worker to master coordination.
return false;
AddWorker(desired_worker_id, endpoint);
return true;
// Records that worker `worker_id` finished recovering by inserting the pair
// `(worker_id, recovery_info)` into `recovered_workers_`. The insertion is
// wrapped in a CHECK whose message indicates that a worker is expected to
// report this only once.
void MasterCoordination::WorkerRecoveredSnapshot(
int worker_id,
const std::optional<durability::RecoveryInfo> &recovery_info) {
CHECK(recovered_workers_.insert(std::make_pair(worker_id, recovery_info))
<< "Worker already notified about finishing recovery";
// Marks the master's recovery phase as done and stores the recovered snapshot
// info (which may be `nullopt`). Both members are written while holding
// `master_lock_`; `RegisterWorker` polls `recovery_done_` under the same lock.
void MasterCoordination::SetRecoveredSnapshot(
std::optional<std::pair<int64_t, tx::TransactionId>>
recovered_snapshot_tx) {
std::lock_guard<std::mutex> guard(master_lock_);
recovery_done_ = true;
recovered_snapshot_tx_ = recovered_snapshot_tx;
// Returns the number of entries currently stored in `recovered_workers_`.
int MasterCoordination::CountRecoveredWorkers() const {
return recovered_workers_.size();
// Returns the value stored by `SetRecoveredSnapshot`. Must only be called
// after recovery is done: the CHECK on `recovery_done_` fails otherwise.
// The members are read while holding `master_lock_`.
std::optional<std::pair<int64_t, tx::TransactionId>>
MasterCoordination::RecoveredSnapshotTx() const {
std::lock_guard<std::mutex> guard(master_lock_);
CHECK(recovery_done_) << "Recovered snapshot requested before it's available";
return recovered_snapshot_tx_;
// Returns the WAL transactions selected from the master's and the workers'
// recovery info. `tx_cnt` keeps one counter per transaction id and a
// transaction ends up in the result when its counter equals `cluster_size`
// (number of recovered workers + 1 for the master).
// @param master_info recovery info of the master itself.
// @return the selected transaction ids; an empty vector if any recovered
//         worker has no recovery info.
std::vector<tx::TransactionId> MasterCoordination::CommonWalTransactions(
const durability::RecoveryInfo &master_info) const {
int cluster_size;
std::unordered_map<tx::TransactionId, int> tx_cnt;
for (auto tx : master_info.wal_recovered) {
std::lock_guard<std::mutex> guard(master_lock_);
// Iterate by const reference: every element holds an optional RecoveryInfo
// (with its `wal_recovered` container), so copying it per iteration would be
// needless work done while `master_lock_` is held.
for (const auto &worker : recovered_workers_) {
// If there is no recovery info we can just return an empty vector since
// we can't restore any transaction
if (!worker.second) return {};
for (auto tx : worker.second->wal_recovered) {
// Add one because of master
cluster_size = recovered_workers_.size() + 1;
std::vector<tx::TransactionId> tx_intersection;
for (auto tx : tx_cnt) {
if (tx.second == cluster_size) {
return tx_intersection;
// Starts the RPC server and the periodic heartbeat.
// Returns `false` if `server_.Start()` fails; in that case neither the
// self-registration nor the scheduler job below is performed.
bool MasterCoordination::Start() {
if (!server_.Start()) return false;
// The master registers itself as worker 0 using the server's endpoint.
AddWorker(0, server_.endpoint());
// Run `IssueHeartbeats` every `kHeartbeatIntervalSeconds` seconds.
scheduler_.Run("Heartbeat", std::chrono::seconds(kHeartbeatIntervalSeconds),
[this] { IssueHeartbeats(); });
return true;
// Blocks while `alive_` is set, then runs the shutdown sequence: invokes
// `call_before_shutdown` with a snapshot of `cluster_alive_`, asks every
// worker except the master (id 0) to shut down in ascending id order, and
// waits until no worker endpoint accepts connections any more.
// @param call_before_shutdown callback that receives the cluster state and
//        returns whether its own work succeeded.
// @return `true` only if the callback returned `true` and the cluster was
//         alive when the shutdown started.
bool MasterCoordination::AwaitShutdown(
std::function<bool(bool)> call_before_shutdown) {
// Wait for a shutdown notification.
while (alive_) {
// Copy the current value of the cluster state.
bool is_cluster_alive = cluster_alive_;
// Call the before shutdown callback.
bool ret = call_before_shutdown(is_cluster_alive);
// Stop the heartbeat scheduler so we don't cause any errors during shutdown.
// Also, we manually issue one final heartbeat to all workers so that their
// counters are reset. This must be done immediately before issuing shutdown
// requests to the workers. The `IssueHeartbeats` will ignore any errors that
// occur now because we are in the process of shutting the cluster down.
// Shutdown all workers.
auto workers = GetWorkers();
// Copy the workers into a vector so they can be processed in id order.
std::vector<std::pair<int, io::network::Endpoint>> workers_sorted(
workers.begin(), workers.end());
std::sort(workers_sorted.begin(), workers_sorted.end(),
[](const std::pair<int, io::network::Endpoint> &a,
const std::pair<int, io::network::Endpoint> &b) {
return a.first < b.first;
LOG(INFO) << "Starting shutdown of all workers.";
for (const auto &worker : workers_sorted) {
// Skip master (self).
if (worker.first == 0) continue;
auto client_pool = GetClientPool(worker.first);
try {
} catch (const communication::rpc::RpcFailedException &e) {
// A failed shutdown request is only logged; the loop below still waits
// for the worker's endpoint to become unreachable.
LOG(WARNING) << "Couldn't shutdown " << GetWorkerName(e.endpoint());
// Make sure all workers have died.
while (true) {
std::vector<std::string> workers_alive;
for (const auto &worker : workers_sorted) {
// Skip master (self).
if (worker.first == 0) continue;
// A worker counts as alive while a connection to it can be established.
if (io::network::CanEstablishConnection(worker.second)) {
if (workers_alive.size() == 0) break;
LOG(INFO) << "Waiting for " << utils::Join(workers_alive, ", ")
<< " to finish shutting down...";
LOG(INFO) << "Shutdown of all workers is complete.";
// Some RPC servers might still depend on the cluster status to shut down. At
// this point all workers are down which means that the cluster is also not
// alive any more.;
// Shutdown our RPC server.
// Return `true` if the cluster is alive and the `call_before_shutdown`
// succeeded.
return ret && is_cluster_alive;
// Shutdown request entry point. NOTE(review): the body visible here is only
// an empty statement; `AwaitShutdown` loops on `alive_`, so this is presumably
// where that flag gets cleared -- confirm against the full source.
void MasterCoordination::Shutdown() {; }
// Sends one heartbeat round to every registered worker except the master
// itself (id 0). The whole round runs while holding `master_lock_`. An
// `RpcFailedException` is ignored when `alive_` is already `false`; otherwise
// it is logged as a degraded cluster state.
void MasterCoordination::IssueHeartbeats() {
std::lock_guard<std::mutex> guard(master_lock_);
auto workers = GetWorkers();
for (const auto &worker : workers) {
// Skip master (self).
if (worker.first == 0) continue;
auto client_pool = GetClientPool(worker.first);
try {
// TODO (mferencevic): Should we retry this call to ignore some transient
// communication errors?
} catch (const communication::rpc::RpcFailedException &e) {
// If we are not alive that means that we are in the process of a
// shutdown. We ignore any exceptions here to stop our Heartbeat from
// displaying warnings that the workers may have died (they should die,
// we are shutting them down). Note: The heartbeat scheduler must stay
// alive to ensure that the workers receive their heartbeat requests
// during shutdown (which may take a long time).
if (!alive_) continue;
LOG(WARNING) << "The " << GetWorkerName(e.endpoint())
<< " didn't respond to our heartbeat request. The cluster "
"is in a degraded state and we are starting a graceful "
"shutdown. Please check the logs on the worker for "
"more details.";
// Set the `cluster_alive_` flag to `false` to indicate that something
// in the cluster failed.;
// Shutdown the whole cluster.
} // namespace distributed

View File

@ -1,102 +0,0 @@
#pragma once
#include <atomic>
#include <functional>
#include <mutex>
#include <optional>
#include <set>
#include <unordered_map>
#include "distributed/coordination.hpp"
#include "durability/distributed/recovery.hpp"
#include "io/network/endpoint.hpp"
#include "utils/scheduler.hpp"
namespace distributed {
using Endpoint = io::network::Endpoint;
/** Handles worker registration, getting of other workers' endpoints and
* coordinated shutdown in a distributed memgraph. Master side. */
// Besides the shutdown handling this class keeps the durability recovery
// bookkeeping (`recovery_done_`, `recovered_workers_`,
// `recovered_snapshot_tx_`) and owns the scheduler that pings the workers.
// The class is neither copyable nor movable (all four operations are deleted).
// NOTE(review): `std::map` and `std::thread::hardware_concurrency()` are used
// below, but `<map>` and `<thread>` are not among the includes visible at the
// top of this header -- presumably they arrive transitively; confirm.
class MasterCoordination final : public Coordination {
explicit MasterCoordination(
const Endpoint &master_endpoint,
int server_workers_count = std::thread::hardware_concurrency(),
int client_workers_count = std::thread::hardware_concurrency());
MasterCoordination(const MasterCoordination &) = delete;
MasterCoordination(MasterCoordination &&) = delete;
MasterCoordination &operator=(const MasterCoordination &) = delete;
MasterCoordination &operator=(MasterCoordination &&) = delete;
/** Registers a new worker with this master coordination.
* @param desired_worker_id - The ID the worker would like to have.
* @return True if the desired ID for the worker is available, or false
* if the desired ID is already taken.
bool RegisterWorker(int desired_worker_id, Endpoint endpoint);
* Worker `worker_id` finished with recovering, adds it to the set of
* recovered workers alongside with its recovery_info.
void WorkerRecoveredSnapshot(
int worker_id,
const std::optional<durability::RecoveryInfo> &recovery_info);
/// Sets the recovery info. nullopt indicates nothing was recovered.
void SetRecoveredSnapshot(
std::optional<std::pair<int64_t, tx::TransactionId>> recovered_snapshot);
std::optional<std::pair<int64_t, tx::TransactionId>> RecoveredSnapshotTx()
int CountRecoveredWorkers() const;
std::vector<tx::TransactionId> CommonWalTransactions(
const durability::RecoveryInfo &master_info) const;
/// Starts the coordination and its servers.
bool Start();
/// Waits while the cluster is in a valid state or the `Shutdown` method is
/// called (suitable for use with signal handlers). Blocks the calling thread
/// until that has finished.
/// @param call_before_shutdown function that should be called before
/// shutdown, the function gets a bool argument indicating whether the cluster
/// is alive and should return a bool indicating whether the shutdown
/// succeeded without any issues
/// @returns `true` if the shutdown was completed without any issues, `false`
/// otherwise
bool AwaitShutdown(std::function<bool(bool)> call_before_shutdown =
[](bool is_cluster_alive) -> bool { return true; });
/// Hints that the coordination should start shutting down the whole cluster.
void Shutdown();
/// Sends a heartbeat request to all workers.
void IssueHeartbeats();
// Most master functions aren't thread-safe.
mutable std::mutex master_lock_;
// Durability recovery info.
// Indicates if the recovery phase is done.
bool recovery_done_{false};
// Workers that successfully finished recovering a snapshot, keyed by worker
// id and mapped to their (optional) recovery info.
std::map<int, std::optional<durability::RecoveryInfo>> recovered_workers_;
// If nullopt nothing was recovered.
std::optional<std::pair<int64_t, tx::TransactionId>> recovered_snapshot_tx_;
// Scheduler that is used to periodically ping all registered workers.
utils::Scheduler scheduler_;
// Flags used for shutdown.
std::atomic<bool> alive_{true};
} // namespace distributed

View File

@ -1,47 +0,0 @@
#pragma once
#include <optional>
#include <unordered_map>
#include "communication/rpc/messages.hpp"
#include "durability/distributed/recovery.hpp"
#include "durability/distributed/serialization.hpp"
#include "io/network/endpoint.hpp"
#include "io/network/serialization.hpp"
(lcp:namespace distributed)
;; RPC used for worker registration. Two member groups are declared: the first
;; carries the desired worker id, a port and a durability directory; the second
;; carries the registration outcome, a durability error flag, an optional
;; snapshot to recover and the map of workers. Presumably these are the request
;; and the response respectively (the `:request`/`:response` keywords are not
;; visible in this view) -- confirm.
(lcp:define-rpc register-worker
((desired-worker-id :int16_t)
(port :uint16_t)
(durability-directory "std::string")))
((registration-successful :bool)
(durability-error :bool)
(snapshot-to-recover "std::optional<std::pair<int64_t, tx::TransactionId>>")
(workers "std::unordered_map<int, io::network::Endpoint>"))))
;; RPC carrying a worker id together with that worker's endpoint. The response
;; has no members.
(lcp:define-rpc cluster-discovery
((worker-id :int16_t)
(endpoint "::io::network::Endpoint")))
(:response ()))
;; RPC with an empty request and an empty response; only the call itself
;; carries meaning.
(lcp:define-rpc stop-worker
(:request ())
(:response ()))
;; RPC carrying a worker id and that worker's optional recovery info. The
;; response has no members.
(lcp:define-rpc notify-worker-recovered
((worker-id :int16_t)
(recovery-info "std::optional<durability::RecoveryInfo>")))
(:response ()))
;; Heartbeat RPC: empty request and empty response.
(lcp:define-rpc heartbeat
(:request ())
(:response ()))
(lcp:pop-namespace) ;; distributed

View File

@ -1,109 +0,0 @@
#include <chrono>
#include <mutex>
#include <thread>
#include "glog/logging.h"
#include "distributed/coordination_rpc_messages.hpp"
#include "distributed/coordination_worker.hpp"
namespace distributed {
// Expect that a heartbeat should be received in this time interval. If it is
// not received we assume that the communication is broken and start a shutdown.
// The value is in seconds; it is compared against the time elapsed since
// `last_heartbeat_time_` in the worker's "Heartbeat" scheduler job.
const int kHeartbeatMaxDelaySeconds = 10;
// Check whether a heartbeat is received every `kHeartbeatCheckSeconds`. It
// should be larger than `kHeartbeatIntervalSeconds` defined in the master
// coordination because it makes no sense to check more often than the heartbeat
// is sent. Also, it must be smaller than `kHeartbeatMaxDelaySeconds` to
// function properly.
// The value is in seconds and is used as the period of the scheduler job.
const int kHeartbeatCheckSeconds = 2;
using namespace std::chrono_literals;
// Constructor of the worker side of the coordination. All arguments are
// forwarded to the `Coordination` base, and the RPC handlers used for the
// master-initiated shutdown and for the heartbeat are registered on `server_`.
const io::network::Endpoint &worker_endpoint, int worker_id,
const io::network::Endpoint &master_endpoint, int server_workers_count,
int client_workers_count)
: Coordination(worker_endpoint, worker_id, master_endpoint,
server_workers_count, client_workers_count) {
[&](auto *req_reader, auto *res_builder) {
LOG(INFO) << "The master initiated shutdown of this worker.";
// Every heartbeat request refreshes `last_heartbeat_time_` under
// `heartbeat_lock_`.
server_.Register<HeartbeatRpc>([&](auto *req_reader,
auto *res_builder) {
std::lock_guard<std::mutex> guard(heartbeat_lock_);
last_heartbeat_time_ = std::chrono::steady_clock::now();
// The periodic check is only set up while the scheduler is not running yet.
if (!scheduler_.IsRunning()) {
"Heartbeat", std::chrono::seconds(kHeartbeatCheckSeconds), [this] {
std::lock_guard<std::mutex> guard(heartbeat_lock_);
// Time elapsed since the last heartbeat request was handled.
auto duration =
std::chrono::steady_clock::now() - last_heartbeat_time_;
if (duration > std::chrono::seconds(kHeartbeatMaxDelaySeconds)) {
LOG(WARNING) << "The master hasn't given us a heartbeat request "
"for at least "
<< kHeartbeatMaxDelaySeconds
<< " seconds! We are shutting down...";
// Set the `cluster_alive_` flag to `false` to indicate that
// something in the cluster failed.
cluster_alive_ = false;
// Shutdown the worker.
// The destructor CHECKs that `alive_` is already `false`; destroying the
// object while the flag is still set trips the check with the message below.
WorkerCoordination::~WorkerCoordination() {
CHECK(!alive_) << "You must call Shutdown and AwaitShutdown on "
// Makes the worker `worker_id` at `endpoint` known to this worker by
// forwarding both values to `AddWorker`.
void WorkerCoordination::RegisterWorker(int worker_id,
io::network::Endpoint endpoint) {
AddWorker(worker_id, endpoint);
// Starts the RPC server and returns whatever `server_.Start()` reports.
bool WorkerCoordination::Start() {
return server_.Start();
// Blocks while `alive_` is set, then invokes `call_before_shutdown` with a
// snapshot of `cluster_alive_` and performs the worker's own cleanup.
// @param call_before_shutdown callback that receives the cluster state and
//        returns whether its own work succeeded.
// @return `true` only if the callback returned `true` and the cluster was
//         alive when the shutdown started.
bool WorkerCoordination::AwaitShutdown(
std::function<bool(bool)> call_before_shutdown) {
// Wait for a shutdown notification.
while (alive_) {
// The first thing we need to do is to stop our heartbeat scheduler because
// the master stopped their scheduler immediately before issuing the shutdown
// request to the worker. This will prevent our heartbeat from timing out on a
// regular shutdown.
// Copy the current value of the cluster state.
bool is_cluster_alive = cluster_alive_;
// Call the before shutdown callback.
bool ret = call_before_shutdown(is_cluster_alive);
// Shutdown our RPC server.
// All other cleanup must be done here.
// Return `true` if the cluster is alive and the `call_before_shutdown`
// succeeded.
return ret && is_cluster_alive;
// Shutdown request entry point. NOTE(review): the body visible here is only
// an empty statement; `AwaitShutdown` loops on `alive_`, so this is presumably
// where that flag gets cleared -- confirm against the full source.
void WorkerCoordination::Shutdown() {; }
} // namespace distributed

View File

@ -1,61 +0,0 @@
#pragma once
#include <atomic>
#include <mutex>
#include <unordered_map>
#include "communication/rpc/server.hpp"
#include "distributed/coordination.hpp"
#include "utils/scheduler.hpp"
namespace distributed {
/// Handles worker registration, getting of other workers' endpoints and
/// coordinated shutdown in a distributed memgraph. Worker side.
/// The class is neither copyable nor movable (all four operations are
/// deleted).
// NOTE(review): `std::chrono` and `std::thread::hardware_concurrency()` are
// used below, but `<chrono>` and `<thread>` are not among the includes visible
// at the top of this header -- presumably they arrive transitively; confirm.
class WorkerCoordination final : public Coordination {
const io::network::Endpoint &worker_endpoint, int worker_id,
const io::network::Endpoint &master_endpoint,
int server_workers_count = std::thread::hardware_concurrency(),
int client_workers_count = std::thread::hardware_concurrency());
WorkerCoordination(const WorkerCoordination &) = delete;
WorkerCoordination(WorkerCoordination &&) = delete;
WorkerCoordination &operator=(const WorkerCoordination &) = delete;
WorkerCoordination &operator=(WorkerCoordination &&) = delete;
/// Registers the worker with the given endpoint.
void RegisterWorker(int worker_id, io::network::Endpoint endpoint);
/// Starts the coordination and its servers.
bool Start();
/// Starts listening for a remote shutdown command (issued by the master) or
/// for the `Shutdown` method to be called (suitable for use with signal
/// handlers). Blocks the calling thread until that has finished.
/// @param call_before_shutdown function that should be called before
/// shutdown, the function gets a bool argument indicating whether the cluster
/// is alive and should return a bool indicating whether the shutdown
/// succeeded without any issues
/// @returns `true` if the shutdown was completed without any issues, `false`
/// otherwise
bool AwaitShutdown(std::function<bool(bool)> call_before_shutdown =
[](bool is_cluster_alive) -> bool { return true; });
/// Hints that the coordination should start shutting down the worker.
void Shutdown();
// Heartbeat variables
// `heartbeat_lock_` guards `last_heartbeat_time_`, which is written by the
// heartbeat RPC handler and read by the periodic scheduler job.
std::mutex heartbeat_lock_;
std::chrono::time_point<std::chrono::steady_clock> last_heartbeat_time_;
utils::Scheduler scheduler_;
// Flag used for shutdown.
std::atomic<bool> alive_{true};
// Set to `false` when the heartbeat check decides that the master is gone.
std::atomic<bool> cluster_alive_{true};
} // namespace distributed

View File

@ -1,97 +0,0 @@
#include "distributed/data_manager.hpp"
#include "storage/distributed/storage.hpp"
namespace {
// Clears the per-transaction entry of `cache` that belongs to `tx_id`, if one
// exists. The entry itself stays in the map; only its `Clear()` is called.
template <typename TCache>
void ClearCache(TCache &cache, tx::TransactionId tx_id) {
auto access = cache.access();
auto found = access.find(tx_id);
if (found != access.end()) found->second.Clear();
// Walks over all entries of `cache` and handles those whose transaction id is
// smaller than `oldest_active` (the action taken for such entries is not
// visible in this view; judging by the name they are removed -- confirm).
template <typename TCache>
void DeleteOld(TCache &cache, tx::TransactionId oldest_active) {
auto access = cache.access();
for (auto &kv : access) {
if (kv.first < oldest_active) {
} // anonymous namespace
namespace distributed {
// Specialization for vertices: returns the per-transaction vertex caches.
template <>
DataManager::CacheT<Vertex> &DataManager::caches<Vertex>() {
return vertices_caches_;
// Specialization for edges: returns the per-transaction edge caches.
template <>
DataManager::CacheT<Edge> &DataManager::caches<Edge>() {
return edges_caches_;
// Specialization for vertices: returns the configured vertex cache size.
template <>
size_t DataManager::GetInitSize<Vertex>() const {
return vertex_cache_size_;
// Specialization for edges: returns the configured edge cache size.
template <>
size_t DataManager::GetInitSize<Edge>() const {
return edge_cache_size_;
// Stores the cache sizes and the reference to the RPC clients that are used
// to fetch remote records. Nothing else is done at construction time.
DataManager::DataManager(database::GraphDb &db,
distributed::DataRpcClients &data_clients,
size_t vertex_cache_size, size_t edge_cache_size)
: vertex_cache_size_(vertex_cache_size),
data_clients_(data_clients) {}
// Returns the mutex associated with `tx_id` in `lock_store_`, creating it on
// first use.
std::mutex &DataManager::GetLock(tx::TransactionId tx_id) {
auto accessor = lock_store_.access();
auto found = accessor.find(tx_id);
if (found != accessor.end()) return found->second;
// By passing empty tuple default constructor is used
// and std::mutex is created in ConcurrentMap.
return accessor.emplace(tx_id, std::make_tuple(tx_id), std::make_tuple())
// Specialization for vertices: rewrites the `vertex` and `edge` members of
// every element of an edge collection via the `localize_edges` helper.
// NOTE(review): the right-hand sides of the two assignments and the calls of
// `localize_edges` are not visible in this view.
template <>
void DataManager::LocalizeAddresses<Vertex>(Vertex &vertex) {
auto localize_edges = [this](auto &edges) {
for (auto &element : edges) {
element.vertex =;
element.edge =;
// Specialization for edges: rewrites the edge's `from_` and `to_` members.
// NOTE(review): the right-hand sides of the assignments are not visible in
// this view.
template <>
void DataManager::LocalizeAddresses(Edge &edge) {
edge.from_ =;
edge.to_ =;
// Clears both the vertex cache and the edge cache that belong to `tx_id`.
// The transaction's mutex in `lock_store_` is not touched here.
void DataManager::ClearCacheForSingleTransaction(tx::TransactionId tx_id) {
ClearCache(vertices_caches_, tx_id);
ClearCache(edges_caches_, tx_id);
// Applies `DeleteOld` with `oldest_active` to the vertex caches, the edge
// caches and the per-transaction lock store, in that order.
void DataManager::ClearTransactionalCache(tx::TransactionId oldest_active) {
DeleteOld(vertices_caches_, oldest_active);
DeleteOld(edges_caches_, oldest_active);
DeleteOld(lock_store_, oldest_active);
} // namespace distributed

View File

@ -1,126 +0,0 @@
/// @file
#pragma once
#include "data_structures/concurrent/concurrent_map.hpp"
#include "database/distributed/graph_db.hpp"
#include "distributed/cached_record_data.hpp"
#include "distributed/data_rpc_clients.hpp"
#include "transactions/type.hpp"
#include "utils/cache.hpp"
class Vertex;
class Edge;
namespace distributed {
/// Handles remote data caches for edges and vertices, per transaction.
/// Each transaction id maps to its own LRU cache (`CacheG`) of records keyed
/// by gid, and to its own mutex in `lock_store_` that `Find` and `Emplace`
/// hold while they work on that transaction's cache.
class DataManager {
/// LRU cache of shared record data, keyed by gid.
template <typename TRecord>
using CacheG =
utils::LruCache<gid::Gid, std::shared_ptr<CachedRecordData<TRecord>>>;
/// Concurrent map from a transaction id to that transaction's `CacheG`.
template <typename TRecord>
using CacheT = ConcurrentMap<tx::TransactionId, CacheG<TRecord>>;
DataManager(database::GraphDb &db, distributed::DataRpcClients &data_clients,
size_t vertex_cache_size, size_t edge_cache_size);
/// Finds cached element for the given transaction, worker and gid.
/// On a cache miss the record is requested through
/// `DataRpcClients::RemoteElement`, its addresses are localized and the result
/// is inserted into the cache before being returned.
/// @tparam TRecord Vertex or Edge
template <typename TRecord>
std::shared_ptr<CachedRecordData<TRecord>> Find(tx::TransactionId tx_id,
int from_worker_id,
int worker_id, gid::Gid gid,
bool to_update = false) {
// The cache is looked up before the per-transaction lock is taken.
auto &cache = GetCache<TRecord>(tx_id);
std::unique_lock<std::mutex> guard(GetLock(tx_id));
auto found = cache.Find(gid);
if (found) {
auto data = *found;
// Special handling when an update is requested but the cached entry has no
// new record yet.
if (to_update && !data->new_record) {
return data;
} else {
auto remote = data_clients_.RemoteElement<TRecord>(from_worker_id,
worker_id, tx_id, gid);
if (remote.old_record_ptr) LocalizeAddresses(*remote.old_record_ptr);
if (remote.new_record_ptr) LocalizeAddresses(*remote.new_record_ptr);
if (to_update && !remote.new_record_ptr) {
auto data =
remote.cypher_id, std::move(remote.old_record_ptr),
cache.Insert(gid, data);
return data;
/// Sets the given records as (new, old) data for the given gid.
/// Data is only inserted when the cache has no entry for `gid` yet; an
/// existing entry is left untouched.
template <typename TRecord>
void Emplace(tx::TransactionId tx_id, gid::Gid gid,
CachedRecordData<TRecord> data) {
std::lock_guard<std::mutex> guard(GetLock(tx_id));
// We can't replace existing data because some accessors might be using
// it.
// TODO - consider if it's necessary and OK to copy just the data content.
auto &cache = GetCache<TRecord>(tx_id);
auto found = cache.Find(gid);
if (!found) {
if (data.old_record) LocalizeAddresses(*data.old_record);
if (data.new_record) LocalizeAddresses(*data.new_record);
cache.Insert(gid, std::make_shared<CachedRecordData<TRecord>>(std::move(data)));
/// Removes all the caches for a single transaction.
void ClearCacheForSingleTransaction(tx::TransactionId tx_id);
/// Clears the cache of local transactions that have expired. The signature of
/// this method is dictated by `distributed::TransactionalCacheCleaner`.
void ClearTransactionalCache(tx::TransactionId oldest_active);
/// Rewrites the addresses stored in `record`; specialized per record type.
template <typename TRecord>
void LocalizeAddresses(TRecord &record);
/// Returns the configured cache size for the given record type.
template <typename TRecord>
size_t GetInitSize() const;
/// Returns the cache that belongs to `tx_id`, creating it on first use.
template <typename TRecord>
CacheG<TRecord> &GetCache(tx::TransactionId tx_id) {
auto accessor = caches<TRecord>().access();
auto found = accessor.find(tx_id);
if (found != accessor.end()) return found->second;
return accessor
.emplace(tx_id, std::make_tuple(tx_id),
/// Returns the mutex that belongs to `tx_id`.
std::mutex &GetLock(tx::TransactionId tx_id);
/// Returns the per-transaction caches for the given record type.
template <typename TRecord>
CacheT<TRecord> &caches();
size_t vertex_cache_size_;
size_t edge_cache_size_;
database::GraphDb &db_;
DataRpcClients &data_clients_;
// One mutex per transaction id.
ConcurrentMap<tx::TransactionId, std::mutex> lock_store_;
CacheT<Vertex> vertices_caches_;
CacheT<Edge> edges_caches_;
} // namespace distributed

View File

@ -1,53 +0,0 @@
#include "distributed/data_rpc_clients.hpp"
#include <unordered_map>
#include "distributed/data_rpc_messages.hpp"
#include "storage/distributed/edge.hpp"
#include "storage/distributed/vertex.hpp"
namespace distributed {
// Edge specialization: issues an `EdgeRpc` call on the client pool of
// `worker_id`, passing `TxGidPair{tx_id, gid, from_worker_id}`, and wraps the
// response in a `RemoteElementInfo<Edge>`.
template <>
RemoteElementInfo<Edge> DataRpcClients::RemoteElement(int from_worker_id,
int worker_id,
tx::TransactionId tx_id,
gid::Gid gid) {
auto response = coordination_->GetClientPool(worker_id)->Call<EdgeRpc>(
TxGidPair{tx_id, gid, from_worker_id});
return RemoteElementInfo<Edge>(response.cypher_id,
// Vertex specialization: issues a `VertexRpc` call on the client pool of
// `worker_id`, passing `TxGidPair{tx_id, gid, from_worker_id}`, and wraps the
// response in a `RemoteElementInfo<Vertex>`.
template <>
RemoteElementInfo<Vertex> DataRpcClients::RemoteElement(int from_worker_id,
int worker_id,
tx::TransactionId tx_id,
gid::Gid gid) {
auto response = coordination_->GetClientPool(worker_id)->Call<VertexRpc>(
TxGidPair{tx_id, gid, from_worker_id});
return RemoteElementInfo<Vertex>(response.cypher_id,
// Asks the workers for their vertex count as seen by transaction `tx_id` and
// collects the answers.
// @param tx_id transaction whose view is used for counting.
// @return map from worker id to the vertex count reported by that worker.
std::unordered_map<int, int64_t> DataRpcClients::VertexCounts(
tx::TransactionId tx_id) {
// Each task issues a `VertexCountRpc` and yields (worker_id, count).
// NOTE(review): the leading `-1` presumably means "skip no worker" -- confirm
// against `Coordination::ExecuteOnWorkers`.
auto future_results = coordination_->ExecuteOnWorkers<std::pair<int, int64_t>>(
-1, [tx_id](int worker_id, communication::rpc::ClientPool &client_pool) {
auto response = client_pool.Call<VertexCountRpc>(tx_id);
return std::make_pair(worker_id, response.member);
std::unordered_map<int, int64_t> results;
for (auto &result : future_results) {
auto result_pair = result.get();
int worker = result_pair.first;
// Keep the full 64-bit count: both the RPC result and the returned map use
// `int64_t`, so an `int` temporary here would silently truncate large counts.
int64_t vertex_count = result_pair.second;
results[worker] = vertex_count;
return results;
} // namespace distributed

View File

@ -1,61 +0,0 @@
/// @file
#pragma once
#include <memory>
#include <mutex>
#include <unordered_map>
#include <utility>
#include "distributed/coordination.hpp"
#include "storage/distributed/gid.hpp"
#include "transactions/type.hpp"
namespace distributed {
class RpcWorkerClients;
// Result of fetching a remote record: its cypher id plus owning pointers to
// the old and the new version of the record (either may be null, callers
// test them before dereferencing). The struct is move-constructible only;
// default construction, copying and both assignments are deleted.
template <typename TRecord>
struct RemoteElementInfo {
RemoteElementInfo() = delete;
RemoteElementInfo(const RemoteElementInfo &) = delete;
// TODO (buda): The default move constructor should be deleted but it seems
// that clang-3.9 doesn't know how to do RVO when this struct is used.
RemoteElementInfo(RemoteElementInfo &&) = default;
RemoteElementInfo &operator=(const RemoteElementInfo &) = delete;
RemoteElementInfo &operator=(RemoteElementInfo &&) = delete;
// Takes ownership of both record pointers.
RemoteElementInfo(int64_t cypher_id, std::unique_ptr<TRecord> old_record_ptr,
std::unique_ptr<TRecord> new_record_ptr)
: cypher_id(cypher_id),
new_record_ptr(std::move(new_record_ptr)) {}
int64_t cypher_id;
std::unique_ptr<TRecord> old_record_ptr;
std::unique_ptr<TRecord> new_record_ptr;
/// Provides access to other workers' data.
/// All calls go through the client pools of the `Coordination` passed to the
/// constructor; the pointer is stored as-is and not owned by this class.
class DataRpcClients {
explicit DataRpcClients(Coordination *coordination)
: coordination_(coordination) {}
/// Returns a remote worker's record (vertex/edge) data for the given params.
/// That worker must own the vertex/edge for the given id, and that vertex
/// must be visible in given transaction.
template <typename TRecord>
RemoteElementInfo<TRecord> RemoteElement(int from_worker_id, int worker_id,
tx::TransactionId tx_id,
gid::Gid gid);
/// Returns (worker_id, vertex_count) for each worker and the number of
/// vertices on it from the perspective of transaction `tx_id`.
std::unordered_map<int, int64_t> VertexCounts(tx::TransactionId tx_id);
Coordination *coordination_;
} // namespace distributed

View File

@ -1,125 +0,0 @@
#pragma once
#include <memory>
#include <string>
#include "communication/rpc/messages.hpp"
#include "storage/distributed/edge.hpp"
#include "storage/distributed/gid.hpp"
#include "storage/distributed/rpc/serialization.hpp"
#include "storage/distributed/vertex.hpp"
#include "transactions/type.hpp"
(lcp:namespace distributed)
;; Request payload shared by the vertex and edge RPCs: the transaction id, the
;; gid of the requested record and the id of the requesting worker.
;; Serialized with SLK.
(lcp:define-struct tx-gid-pair ()
((tx-id "::tx::TransactionId")
(gid "::gid::Gid")
(from-worker-id :int64_t))
(:serialize (:slk)))
;; RPC for fetching a vertex. The request is a `TxGidPair`. The old and the new
;; vertex version are each saved as a presence flag followed by the record
;; (when the pointer is set); on load the record is read into the matching
;; `vertex-*-output` member. `worker-id` and both output members are declared
;; with `:dont-save t`.
(lcp:define-rpc vertex
(:request ((member "TxGidPair")))
((cypher-id :int64_t)
(vertex-old-input "const Vertex *"
(lambda (member)
bool has_ptr = self.${member};
slk::Save(has_ptr, builder);
if (has_ptr) {
slk::Save(*self.${member}, builder, self.worker_id);
(lambda (member)
(declare (ignore member))
bool has_ptr;
slk::Load(&has_ptr, reader);
if (has_ptr) {
self->vertex_old_output = std::make_unique<Vertex>();
slk::Load(self->vertex_old_output.get(), reader);
(vertex-new-input "const Vertex *"
(lambda (member)
bool has_ptr = self.${member};
slk::Save(has_ptr, builder);
if (has_ptr) {
slk::Save(*self.${member}, builder, self.worker_id);
(lambda (member)
(declare (ignore member))
bool has_ptr;
slk::Load(&has_ptr, reader);
if (has_ptr) {
self->vertex_new_output = std::make_unique<Vertex>();
slk::Load(self->vertex_new_output.get(), reader);
(worker-id :int64_t :dont-save t)
(vertex-old-output "std::unique_ptr<Vertex>" :initarg nil :dont-save t)
(vertex-new-output "std::unique_ptr<Vertex>" :initarg nil :dont-save t))))
;; RPC for fetching an edge. The request is a `TxGidPair`. The old and the new
;; edge version are each saved as a presence flag followed by the record (when
;; the pointer is set); on load the record is read into the matching
;; `edge-*-output` member. `worker-id` and both output members are declared
;; with `:dont-save t`.
(lcp:define-rpc edge
(:request ((member "TxGidPair")))
((cypher-id :int64_t)
(edge-old-input "const Edge *"
(lambda (member)
bool has_ptr = self.${member};
slk::Save(has_ptr, builder);
if (has_ptr) {
slk::Save(*self.${member}, builder, self.worker_id);
(lambda (member)
(declare (ignore member))
bool has_ptr;
slk::Load(&has_ptr, reader);
if (has_ptr) {
slk::Load(&self->edge_old_output, reader);
(edge-new-input "const Edge *"
(lambda (member)
bool has_ptr = self.${member};
slk::Save(has_ptr, builder);
if (has_ptr) {
slk::Save(*self.${member}, builder, self.worker_id);
(lambda (member)
(declare (ignore member))
bool has_ptr;
slk::Load(&has_ptr, reader);
if (has_ptr) {
// The new version must land in `edge_new_output`; loading it into
// `edge_old_output` would overwrite the old record and leave the new one
// empty.
slk::Load(&self->edge_new_output, reader);
(worker-id :int64_t :dont-save t)
(edge-old-output "std::unique_ptr<Edge>" :initarg nil :dont-save t)
(edge-new-output "std::unique_ptr<Edge>" :initarg nil :dont-save t))))
;; RPC for counting vertices: the request carries a transaction id, the
;; response a 64-bit count.
(lcp:define-rpc vertex-count
(:request ((member "::tx::TransactionId")))
(:response ((member :int64_t))))
(lcp:pop-namespace) ;; distributed

View File

@ -1,63 +0,0 @@
#include "distributed/data_rpc_server.hpp"
#include <memory>
#include "database/distributed/graph_db.hpp"
#include "database/distributed/graph_db_accessor.hpp"
#include "distributed/updates_rpc_server.hpp"
#include "distributed/data_rpc_messages.hpp"
namespace distributed {
// Stores the database pointer and sets up three request handlers: one that
// serves a vertex, one that serves an edge and one that counts vertices.
// Every handler loads its request with `slk::Load`, opens an accessor for the
// requested transaction and writes the response with `slk::Save`.
DataRpcServer::DataRpcServer(database::GraphDb *db,
distributed::Coordination *coordination)
: db_(db) {
// Vertex handler.
[this](auto *req_reader, auto *res_builder) {
VertexReq req;
slk::Load(&req, req_reader);
auto dba = db_->Access(req.member.tx_id);
auto vertex = dba->FindVertexRaw(req.member.gid);
auto *old = vertex.GetOld();
// `newr` is a clone produced by `CloneData()` (or null when there is no new
// version); it is released with `delete` once the response has been saved.
// NOTE(review): if anything between here and the `delete` throws, the clone
// is not released -- confirm whether that can happen.
auto *newr = vertex.GetNew() ? vertex.GetNew()->CloneData() : nullptr;
dba->transaction().id_, req.member.gid,
req.member.from_worker_id, &old, &newr);
VertexRes response(vertex.CypherId(), old, newr, db_->WorkerId());
slk::Save(response, res_builder);
delete newr;
// Edge handler; same structure as the vertex handler.
[this](auto *req_reader, auto *res_builder) {
EdgeReq req;
slk::Load(&req, req_reader);
auto dba = db_->Access(req.member.tx_id);
auto edge = dba->FindEdgeRaw(req.member.gid);
auto *old = edge.GetOld();
auto *newr = edge.GetNew() ? edge.GetNew()->CloneData() : nullptr;
dba->transaction().id_, req.member.gid,
req.member.from_worker_id, &old, &newr);
EdgeRes response(edge.CypherId(), old, newr, db_->WorkerId());
slk::Save(response, res_builder);
delete newr;
// Vertex count handler: counts by iterating over `dba->Vertices(false)`.
[this](auto *req_reader, auto *res_builder) {
VertexCountReq req;
slk::Load(&req, req_reader);
auto dba = db_->Access(req.member);
int64_t size = 0;
for (auto vertex : dba->Vertices(false)) ++size;
VertexCountRes res(size);
slk::Save(res, res_builder);
} // namespace distributed

View File

@ -1,22 +0,0 @@
#pragma once
#include "database/distributed/graph_db.hpp"
#include "distributed/coordination.hpp"
namespace database {
class GraphDb;
namespace distributed {
/// Serves this worker's data to others.
/// The database pointer passed to the constructor is stored as-is in `db_`
/// and dereferenced by the request handlers.
class DataRpcServer {
DataRpcServer(database::GraphDb *db,
distributed::Coordination *coordination);
database::GraphDb *db_;
} // namespace distributed

View File

@ -1,169 +0,0 @@
#include "distributed/dgp/partitioner.hpp"
#include <algorithm>
#include <unordered_map>
#include <vector>
#include "database/distributed/graph_db.hpp"
#include "database/distributed/graph_db_accessor.hpp"
#include "distributed/updates_rpc_clients.hpp"
#include "query/exceptions.hpp"
#include "distributed/dgp/vertex_migrator.hpp"
#include "utils/flag_validation.hpp"
#include "utils/thread/sync.hpp"
// TODO (buda): Implement openCypher commands to control these parameters.
// Flag `dgp_improvement_threshold`: default 10, validated to lie in [1, 100].
// It is divided by 100.0 where it is read as `FLAGS_dgp_improvement_threshold`
// in `Partitioner::FindMigrations`.
dgp_improvement_threshold, 10,
"How much better should specific node score be to consider "
"a migration to another worker. This represents the minimal difference "
"between new score that the vertex will have when migrated and the old one "
"such that it's migrated.",
FLAG_IN_RANGE(1, 100));
// TODO (buda): The default here should be int_max because that will allow us to
// partition large dataset faster. It should be used for our tests where we can
// run the partitioning up front.
// Flag `dgp_max_batch_size`: default 2000, validated to lie in
// [1, std::numeric_limits<int32_t>::max()].
// NOTE(review): `<limits>` is not among the includes visible at the top of
// this file -- presumably it arrives transitively; confirm.
DEFINE_VALIDATED_int32(dgp_max_batch_size, 2000,
"Maximal amount of vertices which should be migrated in "
"one dynamic graph partitioner step.",
FLAG_IN_RANGE(1, std::numeric_limits<int32_t>::max()));
namespace distributed::dgp {
// Stores the database pointer; `db` is not owned by the partitioner.
Partitioner::Partitioner(database::GraphDb *db) : db_(db) {}
// Runs one dynamic graph partitioning step inside its own transaction: finds
// the migrations, hands each one to a `VertexMigrator` and applies the
// deferred updates on all workers.
// @return `(score, true)` when the migrations were applied, or
//         `failed_partitioning_data` (`(numeric_limits<double>::min(), false)`)
//         when the step failed with a handled exception.
// Any other `std::exception` is treated as fatal (`LOG(FATAL)`).
std::pair<double, bool> Partitioner::Partition() {
// NOTE(review): `std::numeric_limits<double>::min()` is the smallest positive
// normalized double, not the most negative value. If a "lowest possible
// score" sentinel is intended, `lowest()` would be the match -- confirm what
// callers compare against before changing it.
auto failed_partitioning_data =
std::make_pair(std::numeric_limits<double>::min(), false);
// Note, in distributed system TxBegin can throw because the server that
// assigns transaction numbers might be unavailable.
try {
auto dba = db_->Access();
VLOG(21) << "Starting DynamicGraphPartitioner in tx: "
<< dba->transaction().id_;
try {
auto data = FindMigrations(*dba);
VertexMigrator migrator(dba.get());
for (auto &migration : data.migrations) {
migrator.MigrateVertex(migration.first, migration.second);
auto apply_futures = db_->updates_clients().UpdateApplyAll(
db_->WorkerId(), dba->transaction().id_);
// Translate every non-successful update result into the matching exception.
for (auto &future : apply_futures) {
switch (future.get()) {
case distributed::UpdateResult::SERIALIZATION_ERROR:
throw mvcc::SerializationError(
"Failed to relocate vertex due to SerializationError");
case distributed::UpdateResult::UNABLE_TO_DELETE_VERTEX_ERROR:
throw query::RemoveAttachedVertexException();
case distributed::UpdateResult::UPDATE_DELETED_ERROR:
throw query::QueryRuntimeException(
"Failed to apply deferred updates due to RecordDeletedError");
case distributed::UpdateResult::LOCK_TIMEOUT_ERROR:
throw utils::LockTimeoutException(
"Failed to apply deferred update due to LockTimeoutException");
case distributed::UpdateResult::DONE:
VLOG(21) << "Sucesfully migrated " << data.migrations.size()
<< " vertices with score " << data.score << ".";
return std::make_pair(data.score, true);
} catch (const utils::BasicException &e) {
VLOG(21) << "Didn't succeed in relocating; " << e.what();
// Returning VertexAccessors after Abort might not be a good idea. + The
// returned migrations are entirely useless because the engine didn't
// succeed to migrate anything.
return failed_partitioning_data;
} catch (const communication::rpc::RpcFailedException &e) {
// Transaction start failed because BeginRpc failed. Nothing to cleanup.
// Any other RpcFailedExceptions should be handled in the inner try block.
VLOG(21) << "Failed to start DGP transaction; " << e.what();
return failed_partitioning_data;
} catch (const std::exception &e) {
LOG(FATAL) << "Unhandled exception during partitioning. " << e.what();
/// Scores every vertex visible through `dba` against every worker and collects
/// the vertices that would be better placed on another worker.
///
/// For a vertex and a worker the score is
///   label_counts[worker] / degree - worker_vertex_count / average_vertex_count
/// i.e. the share of the vertex's edge endpoints on that worker minus the
/// worker's vertex count relative to the average.
/// @return The accumulated best scores together with the chosen
/// (vertex, destination worker) pairs.
MigrationsData Partitioner::FindMigrations(database::GraphDbAccessor &dba) {
// Find workers vertex count
std::unordered_map<int, int64_t> worker_vertex_count =
// TODO (buda): Add total edge count as an option.
int64_t total_vertex_count = 0;
for (auto worker_vertex_count_pair : worker_vertex_count) {
total_vertex_count += worker_vertex_count_pair.second;
double average_vertex_count =
total_vertex_count * 1.0 / worker_vertex_count.size();
// NOTE(review): std::numeric_limits<double>::min() is the smallest positive
// normalized double, not the most negative value. Confirm that this is the
// intended "nothing to do" score.
if (average_vertex_count == 0)
return MigrationsData(std::numeric_limits<double>::min());
double local_graph_score = 0;
// Considers all migrations which maximally improve single vertex score
std::vector<std::pair<VertexAccessor, int>> migrations;
for (const auto &vertex : dba.Vertices(false)) {
auto label_counts = CountLabels(vertex);
std::unordered_map<int, double> per_label_score;
size_t degree = vertex.in_degree() + vertex.out_degree();
// Isolated vertices are skipped; this also avoids dividing by zero below.
if (degree == 0) continue;
for (auto worker_vertex_count_pair : worker_vertex_count) {
int worker = worker_vertex_count_pair.first;
// This local shadows the outer map of the same name inside the loop body.
int64_t worker_vertex_count = worker_vertex_count_pair.second;
// operator[] on label_counts yields 0 for a worker with no counted endpoint.
per_label_score[worker] =
label_counts[worker] * 1.0 / degree -
worker_vertex_count * 1.0 / average_vertex_count;
// Orders (worker, score) entries by score only.
auto label_cmp = [](const std::pair<int, double> &p1,
const std::pair<int, double> &p2) {
return p1.second < p2.second;
auto best_label = std::max_element(per_label_score.begin(),
per_label_score.end(), label_cmp);
local_graph_score += best_label->second;
// Consider as a migration only if the improvement is high enough
// (the flag is divided by 100 before use; presumably it is given in percent).
if (best_label != per_label_score.end() &&
best_label->first != db_->WorkerId() &&
per_label_score[best_label->first] -
FLAGS_dgp_improvement_threshold / 100.0 >=
per_label_score[db_->WorkerId()]) {
migrations.emplace_back(vertex, best_label->first);
// Stop scanning once the configured batch size has been reached.
if (migrations.size() >= FLAGS_dgp_max_batch_size) break;
DLOG(INFO) << "Local graph score: " << local_graph_score;
return MigrationsData(local_graph_score, std::move(migrations));
/// Builds the per-worker counters ("labels") for the edges of `vertex`.
/// An edge endpoint's label is the worker id stored in its address when the
/// address is remote, and this instance's own worker id otherwise.
/// @return The `label_count` map (worker id -> count).
std::unordered_map<int, int64_t> Partitioner::CountLabels(
const VertexAccessor &vertex) const {
std::unordered_map<int, int64_t> label_count;
// First loop: the endpoint inspected is edge.from().
for (auto edge : {
auto address = edge.from().address();
auto label = address.is_remote() ? address.worker_id() : db_->WorkerId();
// Second loop: outgoing edges of `vertex`.
for (auto edge : vertex.out()) {
auto address =;
auto label = address.is_remote() ? address.worker_id() : db_->WorkerId();
return label_count;
} // namespace distributed::dgp

View File

@ -1,89 +0,0 @@
/// @file
#pragma once
#include <thread>
#include "distributed/data_rpc_clients.hpp"
#include "distributed/token_sharing_rpc_messages.hpp"
#include "distributed/dgp/vertex_migrator.hpp"
#include "storage/vertex_accessor.hpp"
namespace database {
class GraphDb;
class GraphDbAccessor;
}; // namespace database
namespace distributed::dgp {
/// Contains a set of vertices and where they should be migrated
/// (machine/instance id) + score how good the partitioning is.
struct MigrationsData {
/// List of (vertex, destination id) pairs.
using Migrations = std::vector<std::pair<VertexAccessor, int>>;
/// @param score Partitioning score to store.
/// @param migrations Migrations to store; defaults to an empty list.
MigrationsData(double score, Migrations migrations = Migrations())
: score(std::move(score)), migrations(std::move(migrations)) {}
/// Disable copying because the number of migrations could be huge. The
/// expected number is 1k, but a user can configure the database in a way
/// where the number of migrations could be much higher.
MigrationsData(const MigrationsData &other) = delete;
MigrationsData &operator=(const MigrationsData &other) = delete;
/// Moving is allowed.
MigrationsData(MigrationsData &&other) = default;
MigrationsData &operator=(MigrationsData &&other) = default;
/// Score of the partitioning these migrations were computed for.
double score;
/// Vertices to move, each paired with its destination.
Migrations migrations;
/// Handles dynamic graph partitions, migrates vertices from one worker to
/// another based on available scoring which takes into account neighbours of a
/// vertex and tries to put it where most of its neighbours are located. Also
/// takes into account the number of vertices on the destination and source
/// machine.
class Partitioner {
/// The partitioner needs GraphDb because each partition step is a new
/// database transaction (a database accessor has to be created).
/// TODO (buda): Consider passing GraphDbAccessor directly.
explicit Partitioner(database::GraphDb *db);
/// Neither copyable nor movable.
Partitioner(const Partitioner &other) = delete;
Partitioner(Partitioner &&other) = delete;
Partitioner &operator=(const Partitioner &other) = delete;
Partitioner &operator=(Partitioner &&other) = delete;
/// Runs one dynamic graph partitioning cycle (step). In case of any error,
/// the transaction will be aborted.
/// @return Calculated partitioning score and whether the migrations were
/// successful.
std::pair<double, bool> Partition();
/// Returns a vector of pairs of `vertex` and `destination` of where should
/// some vertex be relocated from the view of `dba` accessor.
/// Each vertex is located on some worker (which in context of migrations we
/// call a vertex label). Each vertex has its score for each different label
/// (worker_id) evaluated. This score is calculated by considering
/// neighbouring vertices labels. Simply put, each vertex is attracted to be
/// located on the same worker as its neighbouring vertices. Migrations which
/// improve that scoring, which also takes into account saturation of other
/// workers on which it's considering to migrate this vertex, are determined.
MigrationsData FindMigrations(database::GraphDbAccessor &dba);
/// Counts number of each label (worker_id) on endpoints of edges (in/out) of
/// `vertex`.
/// @return A map consisting of (label/machine/instance id, count) key-value
/// pairs.
std::unordered_map<int, int64_t> CountLabels(
const VertexAccessor &vertex) const;
/// Database handed to the constructor; null until then.
database::GraphDb *db_{nullptr};
} // namespace distributed::dgp

View File

@ -1,62 +0,0 @@
#include "distributed/dgp/vertex_migrator.hpp"
#include "database/distributed/distributed_graph_db.hpp"
#include "database/distributed/graph_db_accessor.hpp"
#include "query/typed_value.hpp"
namespace distributed::dgp {
VertexMigrator::VertexMigrator(database::GraphDbAccessor *dba) : dba_(dba) {}
/// Re-creates `vertex` on worker `destination` and re-inserts its edges so
/// that they attach to the relocated copy.
/// @param vertex Vertex to relocate.
/// @param destination Id of the worker that receives the new vertex.
void VertexMigrator::MigrateVertex(VertexAccessor &vertex, int destination) {
// Copies all properties of a record into a map.
auto get_props = [](auto &record) {
std::unordered_map<storage::Property, PropertyValue> properties;
for (auto prop : record.Properties()) {
properties[prop.first] = prop.second;
return properties;
// Rebinds an accessor to its relocated copy if that vertex was already moved
// by this migrator.
auto update_if_moved = [this](auto &vertex) {
if (vertex_migrated_to_.count(vertex.gid())) {
vertex = VertexAccessor(vertex_migrated_to_[vertex.gid()], *dba_);
auto relocated_vertex = database::InsertVertexIntoRemote(
dba_, destination, vertex.labels(), get_props(vertex), vertex.CypherId());
// Remember where this gid now lives.
vertex_migrated_to_[vertex.gid()] = relocated_vertex.address();
for (auto out_edge : vertex.out()) {
auto to =;
// Here cypher_id has to be passed to the other machine because this
// machine owns the edge.
auto new_out_edge =
dba_->InsertEdge(relocated_vertex, to, out_edge.EdgeType(),
std::nullopt, out_edge.CypherId());
for (auto prop : get_props(out_edge)) {
new_out_edge.PropsSet(prop.first, prop.second);
for (auto in_edge : {
auto from = in_edge.from();
// Continue on self-loops since those edges have already been added
// while iterating over out edges.
if (from == vertex) continue;
// Both gid and cypher_id should be without value because this machine
// doesn't own the edge.
// NOTE(review): the call below does pass in_edge.CypherId(); only the gid
// argument is std::nullopt. Confirm which of the two is intended.
auto new_in_edge =
dba_->InsertEdge(from, relocated_vertex, in_edge.EdgeType(),
std::nullopt, in_edge.CypherId());
for (auto prop : get_props(in_edge)) {
new_in_edge.PropsSet(prop.first, prop.second);
} // namespace distributed::dgp

View File

@ -1,37 +0,0 @@
/// @file
#pragma once
#include <thread>
#include <unordered_map>
#include "storage/distributed/gid.hpp"
#include "storage/distributed/vertex_accessor.hpp"
namespace database {
class GraphDbAccessor;
}; // namespace database
namespace distributed::dgp {
/// Migrates vertices from one worker to another (updates edges as well).
class VertexMigrator {
/// @param dba Accessor through which the migrator performs its inserts.
explicit VertexMigrator(database::GraphDbAccessor *dba);
/// Neither copyable nor movable.
VertexMigrator(const VertexMigrator &other) = delete;
VertexMigrator(VertexMigrator &&other) = delete;
VertexMigrator &operator=(const VertexMigrator &other) = delete;
VertexMigrator &operator=(VertexMigrator &&other) = delete;
/// Creates a new vertex on the destination, deletes the old `vertex`, and
/// deletes/creates every new edge that it needs since the destination of the
/// vertex changed.
void MigrateVertex(VertexAccessor &v, int destination);
/// Accessor supplied at construction.
database::GraphDbAccessor *dba_;
/// Maps the gid of each migrated vertex to the address of its relocated copy.
std::unordered_map<gid::Gid, storage::VertexAddress> vertex_migrated_to_;
} // namespace distributed::dgp

View File

@ -1,52 +0,0 @@
#include "distributed/durability_rpc_master.hpp"
#include "distributed/durability_rpc_messages.hpp"
#include "transactions/transaction.hpp"
#include "utils/future.hpp"
namespace distributed {
/// Asynchronously issues MakeSnapshotRpc for transaction `tx` through
/// ExecuteOnWorkers and combines the per-worker answers.
/// @return Future that is true only if every worker reported success; a
/// worker whose RPC throws RpcFailedException counts as a failure.
utils::Future<bool> DurabilityRpcMaster::MakeSnapshot(tx::TransactionId tx) {
return utils::make_future(std::async(std::launch::async, [this, tx] {
auto futures = coordination_->ExecuteOnWorkers<bool>(
0, [tx](int worker_id, communication::rpc::ClientPool &client_pool) {
try {
auto res = client_pool.Call<MakeSnapshotRpc>(tx);
return res.member;
} catch (const communication::rpc::RpcFailedException &e) {
return false;
bool created = true;
// `&=` does not short-circuit, so every future is waited on even after a
// failure has been seen.
for (auto &future : futures) {
created &= future.get();
return created;
/// Asynchronously runs a per-worker task through ExecuteOnWorkers with
/// `recovery_data` captured, and combines the per-worker answers.
/// @return Future that is true only if every worker task returned true; a
/// task that throws RpcFailedException yields false.
utils::Future<bool> DurabilityRpcMaster::RecoverWalAndIndexes(
durability::RecoveryData *recovery_data) {
return utils::make_future(
std::async(std::launch::async, [this, recovery_data] {
auto futures = coordination_->ExecuteOnWorkers<bool>(
0, [recovery_data](int worker_id,
communication::rpc::ClientPool &client_pool) {
try {
return true;
} catch (const communication::rpc::RpcFailedException &e) {
return false;
bool recovered = true;
// `&=` does not short-circuit, so every future is waited on.
for (auto &future : futures) {
recovered &= future.get();
return recovered;
} // namespace distributed

View File

@ -1,33 +0,0 @@
#pragma once
#include <future>
#include <mutex>
#include <utility>
#include "distributed/coordination.hpp"
#include "durability/distributed/recovery.hpp"
#include "storage/distributed/gid.hpp"
#include "transactions/type.hpp"
namespace distributed {
/// Provides an ability to trigger snapshotting on other workers.
class DurabilityRpcMaster {
/// @param coordination Used to execute calls on the workers.
explicit DurabilityRpcMaster(Coordination *coordination)
: coordination_(coordination) {}
// Sends a snapshot request to workers and returns a future which becomes true
// if all workers successfully completed their snapshot creation, false
// otherwise
// @param tx - transaction from which to take db snapshot
utils::Future<bool> MakeSnapshot(tx::TransactionId tx);
// Returns a future which becomes true if every per-worker task succeeded,
// false otherwise.
utils::Future<bool> RecoverWalAndIndexes(
durability::RecoveryData *recovery_data);
// Coordination object supplied at construction.
Coordination *coordination_;
} // namespace distributed

View File

@ -1,20 +0,0 @@
#pragma once
#include "communication/rpc/messages.hpp"
#include "durability/distributed/recovery.hpp"
#include "durability/distributed/serialization.hpp"
#include "transactions/transaction.hpp"
(lcp:namespace distributed)
(lcp:define-rpc make-snapshot
(:request ((member "::tx::TransactionId")))
(:response ((member :bool))))
(lcp:define-rpc recover-wal-and-indexes
(:request ((member "::durability::RecoveryData")))
(:response ()))
(lcp:pop-namespace) ;; distributed

View File

@ -1,31 +0,0 @@
#include "distributed/durability_rpc_worker.hpp"
#include "database/distributed/distributed_graph_db.hpp"
#include "database/distributed/graph_db_accessor.hpp"
#include "distributed/durability_rpc_messages.hpp"
namespace distributed {
// Constructor: stores `db` and defines the two request handlers below.
database::Worker *db, distributed::Coordination *coordination)
: db_(db) {
// Snapshot handler: req.member is passed to Access() to open an accessor,
// and the result of MakeSnapshot is sent back in the response.
[this](auto *req_reader, auto *res_builder) {
MakeSnapshotReq req;
slk::Load(&req, req_reader);
auto dba = db_->Access(req.member);
MakeSnapshotRes res(db_->MakeSnapshot(*dba));
slk::Save(res, res_builder);
// Recovery handler: loads the request and replies with a
// default-constructed response.
[this](auto *req_reader, auto *res_builder) {
RecoverWalAndIndexesReq req;
slk::Load(&req, req_reader);
RecoverWalAndIndexesRes res;
slk::Save(res, res_builder);
} // namespace distributed

View File

@ -1,19 +0,0 @@
#pragma once
#include "distributed/coordination.hpp"
namespace database {
class Worker;
}; // namespace database
namespace distributed {
/// Worker-side counterpart of the durability RPCs.
class DurabilityRpcWorker {
/// @param db Worker database the handlers operate on.
/// @param coordination Coordination object passed in by the owner.
DurabilityRpcWorker(database::Worker *db, distributed::Coordination *coordination);
/// Worker database supplied at construction.
database::Worker *db_;
} // namespace distributed

View File

@ -1,43 +0,0 @@
#include "distributed/dynamic_worker.hpp"
#include "database/distributed/graph_db.hpp"
#include "distributed/dynamic_worker_rpc_messages.hpp"
namespace distributed {
/// Stores `db` and `coordination` and defines the DynamicWorker request
/// handler, which answers with GetIndicesToCreate().
DynamicWorkerAddition::DynamicWorkerAddition(database::GraphDb *db,
distributed::Coordination *coordination)
: db_(db), coordination_(coordination) {
[this](auto *req_reader, auto *res_builder) {
DynamicWorkerReq req;
slk::Load(&req, req_reader);
DynamicWorkerRes res(this->GetIndicesToCreate());
slk::Save(res, res_builder);
/// Lists the label-property indices of the local storage as
/// (label name, property name) pairs.
/// @return An empty vector while `enabled_` is false; otherwise one pair per
/// key of the label-property index.
std::vector<std::pair<std::string, std::string>>
DynamicWorkerAddition::GetIndicesToCreate() {
std::vector<std::pair<std::string, std::string>> indices;
if (!enabled_.load()) return indices;
for (const auto &key : db_->storage().label_property_index().Keys()) {
// Translate the ids stored in the key back to their names.
auto label = db_->label_mapper().id_to_value(key.label_);
auto property = db_->property_mapper().id_to_value(key.property_);
indices.emplace_back(label, property);
return indices;
// NOTE(review): only an empty statement is visible in this body; confirm what
// Enable() is expected to do to `enabled_`, which GetIndicesToCreate() reads.
void DynamicWorkerAddition::Enable() {; }
DynamicWorkerRegistration::DynamicWorkerRegistration(communication::rpc::ClientPool *client_pool)
: client_pool_(client_pool) {}
/// Calls DynamicWorkerRpc through the client pool.
/// @return The `recover_indices` field of the RPC result.
std::vector<std::pair<std::string, std::string>>
DynamicWorkerRegistration::GetIndicesToCreate() {
auto result = client_pool_->Call<DynamicWorkerRpc>();
return result.recover_indices;
} // namespace distributed

View File

@ -1,46 +0,0 @@
/// @file
#pragma once
#include <atomic>
#include <string>
#include <vector>
#include "communication/rpc/client_pool.hpp"
#include "distributed/coordination.hpp"
namespace database {
class GraphDb;
} // namespace database
namespace distributed {
/// Answers DynamicWorker requests with the indices a newly added worker has
/// to create.
class DynamicWorkerAddition final {
/// @param db Database whose label-property indices are reported.
/// @param coordination Coordination object supplied by the owner.
DynamicWorkerAddition(database::GraphDb *db,
distributed::Coordination *coordination);
/// Enable dynamic worker addition.
void Enable();
/// Database supplied at construction; null until then.
database::GraphDb *db_{nullptr};
/// Coordination object supplied at construction.
distributed::Coordination *coordination_;
/// While false, GetIndicesToCreate() returns an empty list.
std::atomic<bool> enabled_{false};
/// Return the indices a dynamically added worker needs to create.
std::vector<std::pair<std::string, std::string>> GetIndicesToCreate();
/// Requests the list of indices to create through an RPC client pool.
class DynamicWorkerRegistration final {
/// @param client_pool Pool used for the RPC call.
explicit DynamicWorkerRegistration(
communication::rpc::ClientPool *client_pool);
/// Make a RPC call to master to get indices to create.
std::vector<std::pair<std::string, std::string>> GetIndicesToCreate();
/// Client pool supplied at construction.
communication::rpc::ClientPool *client_pool_;
} // namespace distributed

View File

@ -1,18 +0,0 @@
#pragma once
#include <vector>
#include <string>
#include "communication/rpc/messages.hpp"
#include "slk/serialization.hpp"
(lcp:namespace distributed)
(lcp:define-rpc dynamic-worker
(:request ())
((recover-indices "std::vector<std::pair<std::string, std::string>>"))))
(lcp:pop-namespace) ;; distributed

View File

@ -1,28 +0,0 @@
#pragma once
#include <memory>
#include <string>
#include "communication/rpc/messages.hpp"
#include "storage/common/types/types.hpp"
#include "storage/distributed/rpc/serialization.hpp"
#include "transactions/transaction.hpp"
(lcp:namespace distributed)
(lcp:define-rpc populate-index
((label "::storage::Label")
(property "::storage::Property")
(tx-id "::tx::TransactionId")))
(:response ()))
(lcp:define-rpc create-index
((label "::storage::Label")
(property "::storage::Property")))
(:response ()))
(lcp:pop-namespace) ;; distributed

View File

@ -1,35 +0,0 @@
#include "distributed/index_rpc_server.hpp"
#include "database/distributed/graph_db.hpp"
#include "database/distributed/graph_db_accessor.hpp"
#include "distributed/index_rpc_messages.hpp"
namespace distributed {
/// Stores `db` and defines the CreateIndex and PopulateIndex request handlers.
IndexRpcServer::IndexRpcServer(database::GraphDb *db,
distributed::Coordination *coordination)
: db_(db) {
// CreateIndex handler: builds an index key from the request and replies with
// a default-constructed response.
[this](auto *req_reader, auto *res_builder) {
CreateIndexReq req;
slk::Load(&req, req_reader);
database::LabelPropertyIndex::Key key{req.label,};
CreateIndexRes res;
slk::Save(res, res_builder);
// PopulateIndex handler: builds the same kind of key and opens an accessor
// for the transaction id carried in the request.
[this](auto *req_reader, auto *res_builder) {
PopulateIndexReq req;
slk::Load(&req, req_reader);
database::LabelPropertyIndex::Key key{req.label,};
auto dba = db_->Access(req.tx_id);
PopulateIndexRes res;
slk::Save(res, res_builder);
} // namespace distributed

View File

@ -1,19 +0,0 @@
#pragma once
#include "distributed/coordination.hpp"
namespace database {
class GraphDb;
namespace distributed {
/// Serves the index-related RPC requests for a database.
class IndexRpcServer {
/// @param db Database the handlers operate on.
/// @param coordination Coordination object supplied by the owner.
IndexRpcServer(database::GraphDb *db, distributed::Coordination *coordination);
/// Database supplied at construction.
database::GraphDb *db_;
} // namespace distributed

View File

@ -1,44 +0,0 @@
#include "distributed/plan_consumer.hpp"
namespace distributed {
/// Defines the DispatchPlan and RemovePlan request handlers.
PlanConsumer::PlanConsumer(distributed::Coordination *coordination) {
// DispatchPlan handler: wraps the received plan and symbol table in a
// PlanPack associated with req.plan_id.
[this](auto *req_reader, auto *res_builder) {
DispatchPlanReq req;
slk::Load(&req, req_reader);
req.plan_id, std::make_unique<PlanPack>(req.plan, req.symbol_table,
DispatchPlanRes res;
slk::Save(res, res_builder);
// RemovePlan handler: loads the request and replies with a
// default-constructed response.
[this](auto *req_reader, auto *res_builder) {
RemovePlanReq req;
slk::Load(&req, req_reader);
RemovePlanRes res;
slk::Save(res, res_builder);
/// Looks up the cached PlanPack for `plan_id`.
/// The CHECK fails when no entry exists for that id.
/// @return Reference to the cached PlanPack.
PlanConsumer::PlanPack &PlanConsumer::PlanForId(int64_t plan_id) const {
auto accessor = plan_cache_.access();
auto found = accessor.find(plan_id);
CHECK(found != accessor.end())
<< "Missing plan and symbol table for plan id: " << plan_id;
// The cache stores unique_ptr<PlanPack>, hence the dereference.
return *found->second;
/// Collects the keys of the plan cache.
/// @return Ids of all cached plans, in the iteration order of the cache.
std::vector<int64_t> PlanConsumer::CachedPlanIds() const {
std::vector<int64_t> plan_ids;
auto access = plan_cache_.access();
for (auto &kv : access) plan_ids.emplace_back(kv.first);
return plan_ids;
} // namespace distributed

View File

@ -1,43 +0,0 @@
#pragma once
#include <vector>
#include "distributed/coordination.hpp"
#include "data_structures/concurrent/concurrent_map.hpp"
#include "distributed/plan_rpc_messages.hpp"
#include "query/frontend/semantic/symbol_table.hpp"
#include "query/plan/operator.hpp"
namespace distributed {
/** Handles plan consumption from master. Creates and holds a local cache of
* plans. Worker side. */
class PlanConsumer {
/** A cached plan together with its symbol table and AST storage. */
struct PlanPack {
PlanPack(std::shared_ptr<query::plan::LogicalOperator> plan,
query::SymbolTable symbol_table, query::AstStorage storage)
: plan(plan),
storage(std::move(storage)) {}
/** Root of the cached logical plan. */
std::shared_ptr<query::plan::LogicalOperator> plan;
/** Symbol table belonging to `plan`. */
query::SymbolTable symbol_table;
/** AST storage moved in at construction; immutable afterwards. */
const query::AstStorage storage;
explicit PlanConsumer(distributed::Coordination *coordination);
/** Return cached plan and symbol table for a given plan id. */
PlanPack &PlanForId(int64_t plan_id) const;
/** Return the ids of all the cached plans. For testing. */
std::vector<int64_t> CachedPlanIds() const;
// TODO remove unique_ptr. This is to get it to work, emplacing into a
// ConcurrentMap is tricky.
// Declared mutable so the const member functions above can call access().
mutable ConcurrentMap<int64_t, std::unique_ptr<PlanPack>> plan_cache_;
} // namespace distributed

View File

@ -1,32 +0,0 @@
#include <distributed/plan_dispatcher.hpp>
namespace distributed {
PlanDispatcher::PlanDispatcher(Coordination *coordination) : coordination_(coordination) {}
/// Issues DispatchPlanRpc with `plan_id`, `plan` and `symbol_table` through
/// ExecuteOnWorkers and then iterates over the returned futures.
void PlanDispatcher::DispatchPlan(
int64_t plan_id, std::shared_ptr<query::plan::LogicalOperator> plan,
const query::SymbolTable &symbol_table) {
auto futures = coordination_->ExecuteOnWorkers<void>(
// The lambda captures by value: plan_id, the plan pointer and a copy of the
// symbol table.
0, [plan_id, plan, symbol_table](
int worker_id, communication::rpc::ClientPool &client_pool) {
client_pool.Call<DispatchPlanRpc>(plan_id, plan, symbol_table);
for (auto &future : futures) {
/// Runs a per-worker task for `plan_id` through ExecuteOnWorkers and then
/// iterates over the returned futures.
void PlanDispatcher::RemovePlan(int64_t plan_id) {
auto futures = coordination_->ExecuteOnWorkers<void>(
0, [plan_id](int worker_id, communication::rpc::ClientPool &client_pool) {
for (auto &future : futures) {
} // namespace distributed

View File

@ -1,29 +0,0 @@
#pragma once
#include "distributed/coordination.hpp"
#include "distributed/plan_rpc_messages.hpp"
#include "query/frontend/semantic/symbol_table.hpp"
#include "query/plan/operator.hpp"
namespace distributed {
/** Handles plan dispatching to all workers. Uses MasterCoordination to
* acomplish that. Master side.
class PlanDispatcher {
explicit PlanDispatcher(Coordination *coordination);
/** Dispatch a plan to all workers and wait for their acknowledgement. */
void DispatchPlan(int64_t plan_id,
std::shared_ptr<query::plan::LogicalOperator> plan,
const query::SymbolTable &symbol_table);
/** Remove a plan from all workers and wait for their acknowledgement. */
void RemovePlan(int64_t plan_id);
Coordination *coordination_;
} // namespace distributed

View File

@ -1,46 +0,0 @@
#pragma once
#include "communication/rpc/messages.hpp"
#include "query/frontend/ast/ast.hpp"
#include "query/frontend/semantic/symbol_table.hpp"
#include "query/distributed/plan/ops.hpp"
(lcp:namespace distributed)
(defun slk-save-plan (member)
query::plan::LogicalOperator::SaveHelper helper;
self.${member}, builder, &helper.saved_ops,
[&helper](const auto &val, auto *builder) {
slk::Save(val, builder, &helper);
(defun slk-load-plan (member)
query::plan::LogicalOperator::SlkLoadHelper helper;
slk::Load<query::plan::LogicalOperator>(&self->${member}, reader, &helper.loaded_ops,
[&helper](auto *op, auto *reader) {
slk::ConstructAndLoad(op, reader, &helper);
self->storage = std::move(helper.ast_storage);
(lcp:define-rpc dispatch-plan
((plan-id :int64_t)
(plan "std::shared_ptr<query::plan::LogicalOperator>"
:slk-save #'slk-save-plan
:slk-load #'slk-load-plan)
(symbol-table "::query::SymbolTable")
(storage "::query::AstStorage" :initarg nil :dont-save t)))
(:response ()))
(lcp:define-rpc remove-plan
(:request ((member :int64_t)))
(:response ()))
(lcp:pop-namespace) ;; distributed

View File

@ -1,212 +0,0 @@
#include "distributed/produce_rpc_server.hpp"
#include "database/distributed/distributed_graph_db.hpp"
#include "distributed/data_manager.hpp"
#include "distributed/pull_produce_rpc_messages.hpp"
#include "query/common.hpp"
#include "query/exceptions.hpp"
#include "transactions/distributed/engine_worker.hpp"
namespace distributed {
// Constructor: opens an accessor for `tx_id`, creates the cursor from the
// plan in `plan_pack`, and fills the execution context (symbol table, memory
// resource, timestamp, parameters, labels).
database::Worker *db, tx::TransactionId tx_id,
const PlanConsumer::PlanPack &plan_pack, int64_t timestamp,
const query::Parameters &parameters,
std::vector<query::Symbol> pull_symbols)
: dba_(db->Access(tx_id)),
cursor_(plan_pack.plan->MakeCursor(execution_memory_.get())) {
context_.symbol_table = plan_pack.symbol_table;
// TODO: Maybe we want a separate MemoryResource per pull evaluation
context_.evaluation_context.memory = execution_memory_.get();
context_.evaluation_context.timestamp = timestamp;
context_.evaluation_context.parameters = parameters; =
query::NamesToProperties(, dba_.get());
context_.evaluation_context.labels =
query::NamesToLabels(, dba_.get());
/// Returns the next row of results.
/// While accumulated rows are available the last one is taken from
/// `accumulation_`; otherwise a row is pulled from the cursor.
/// A ReconstructionException raised while processing an accumulated row sets
/// `cursor_state_` to RECONSTRUCTION_ERROR and that state is returned.
std::pair<std::vector<query::TypedValue>, PullState>
ProduceRpcServer::OngoingProduce::Pull() {
if (!accumulation_.empty()) {
auto results = std::move(accumulation_.back());
for (auto &element : results) {
try {
} catch (query::ReconstructionException &) {
cursor_state_ = PullState::RECONSTRUCTION_ERROR;
return std::make_pair(std::move(results), cursor_state_);
return std::make_pair(std::move(results), PullState::CURSOR_IN_PROGRESS);
return PullOneFromCursor();
/// Pulls from the cursor in a loop until PullOneFromCursor reports a state
/// other than CURSOR_IN_PROGRESS.
/// @return That first non-in-progress state.
PullState ProduceRpcServer::OngoingProduce::Accumulate() {
while (true) {
auto result = PullOneFromCursor();
if (result.second != PullState::CURSOR_IN_PROGRESS)
return result.second;
/// Puts the produce back into the CURSOR_IN_PROGRESS state.
void ProduceRpcServer::OngoingProduce::Reset() {
cursor_state_ = PullState::CURSOR_IN_PROGRESS;
/// Pulls a single row from the cursor and records the resulting state in
/// `cursor_state_`.
/// Exceptions thrown by the pull are not propagated; each handled type is
/// mapped to the matching PullState value instead.
/// @return The row together with the current `cursor_state_`.
std::pair<std::vector<query::TypedValue>, PullState>
ProduceRpcServer::OngoingProduce::PullOneFromCursor() {
std::vector<query::TypedValue> results;
// Check if we already exhausted this cursor (or it entered an error
// state). This happens when we accumulate before normal pull.
if (cursor_state_ != PullState::CURSOR_IN_PROGRESS) {
return std::make_pair(results, cursor_state_);
try {
if (cursor_->Pull(frame_, context_)) {
for (const auto &symbol : pull_symbols_) {
} else {
// Pull() returned false: nothing more to produce.
cursor_state_ = PullState::CURSOR_EXHAUSTED;
} catch (const mvcc::SerializationError &) {
cursor_state_ = PullState::SERIALIZATION_ERROR;
} catch (const utils::LockTimeoutException &) {
cursor_state_ = PullState::LOCK_TIMEOUT_ERROR;
} catch (const RecordDeletedError &) {
cursor_state_ = PullState::UPDATE_DELETED_ERROR;
} catch (const query::ReconstructionException &) {
cursor_state_ = PullState::RECONSTRUCTION_ERROR;
} catch (const query::RemoveAttachedVertexException &) {
cursor_state_ = PullState::UNABLE_TO_DELETE_VERTEX_ERROR;
} catch (const query::QueryRuntimeException &) {
cursor_state_ = PullState::QUERY_ERROR;
} catch (const query::HintedAbortError &) {
cursor_state_ = PullState::HINTED_ABORT_ERROR;
return std::make_pair(std::move(results), cursor_state_);
/// Stores `db` and `tx_engine` and defines the request handlers for Pull,
/// ResetCursor and TransactionCommandAdvanced.
ProduceRpcServer::ProduceRpcServer(database::Worker *db,
tx::EngineWorker *tx_engine,
distributed::Coordination *coordination,
const PlanConsumer &plan_consumer,
DataManager *data_manager)
: db_(db),
tx_engine_(tx_engine) {
// Pull handler: the response is built from the result of Pull(req).
[this](auto *req_reader, auto *res_builder) {
PullReq req;
slk::Load(&req, req_reader);
PullRes res(Pull(req));
slk::Save(res, res_builder);
// ResetCursor handler: replies with a default-constructed response.
[this](auto *req_reader, auto *res_builder) {
ResetCursorReq req;
slk::Load(&req, req_reader);
ResetCursorRes res;
slk::Save(res, res_builder);
// TransactionCommandAdvanced handler: captures `data_manager` and replies
// with a default-constructed response.
[this, data_manager](auto *req_reader, auto *res_builder) {
TransactionCommandAdvancedReq req;
slk::Load(&req, req_reader);
TransactionCommandAdvancedRes res;
slk::Save(res, res_builder);
/// Erases every ongoing produce whose transaction id (first element of the
/// map key) is lower than `oldest_active`. Runs under `ongoing_produces_lock_`.
void ProduceRpcServer::ClearTransactionalCache(
tx::TransactionId oldest_active) {
std::lock_guard<std::mutex> guard{ongoing_produces_lock_};
for (auto it = ongoing_produces_.begin(); it != ongoing_produces_.end();) {
if (std::get<0>(it->first) < oldest_active) {
// erase() returns the iterator to continue from.
it = ongoing_produces_.erase(it);
} else {
/// Finds the OngoingProduce keyed by (tx_id, command_id, plan_id) of `req`,
/// constructing it in place when it does not exist yet.
/// The lookup and the insertion both run under `ongoing_produces_lock_`.
ProduceRpcServer::OngoingProduce &ProduceRpcServer::GetOngoingProduce(
const PullReq &req) {
auto key_tuple = std::make_tuple(req.tx_id, req.command_id, req.plan_id);
std::lock_guard<std::mutex> guard{ongoing_produces_lock_};
auto found = ongoing_produces_.find(key_tuple);
if (found != ongoing_produces_.end()) {
return found->second;
// On the worker cache the snapshot to have one RPC less.
tx_engine_->RunningTransaction(req.tx_id, req.tx_snapshot);
auto &plan_pack = plan_consumer_.PlanForId(req.plan_id);
// piecewise_construct builds the OngoingProduce directly inside the map from
// the forwarded constructor arguments.
return ongoing_produces_
.emplace(std::piecewise_construct, std::forward_as_tuple(key_tuple),
std::forward_as_tuple(db_, req.tx_id, plan_pack, req.timestamp,
req.parameters, req.symbols))
/// Serves one pull request.
/// When `req.accumulate` is set, the produce is accumulated first and any
/// state other than CURSOR_EXHAUSTED is returned immediately. Afterwards at
/// most `req.batch_size` pulls are made, stopping at the first pull whose
/// state is not CURSOR_IN_PROGRESS.
/// @return Result data carrying the last observed pull state.
PullResData ProduceRpcServer::Pull(const PullReq &req) {
auto &ongoing_produce = GetOngoingProduce(req);
PullResData result(db_->WorkerId(), req.send_versions);
result.pull_state = PullState::CURSOR_IN_PROGRESS;
if (req.accumulate) {
result.pull_state = ongoing_produce.Accumulate();
// If an error occurred, we need to return that error.
if (result.pull_state != PullState::CURSOR_EXHAUSTED) {
return result;
for (int i = 0; i < req.batch_size; ++i) {
auto pull_result = ongoing_produce.Pull();
result.pull_state = pull_result.second;
if (pull_result.second != PullState::CURSOR_IN_PROGRESS) break;
return result;
/// Looks up the ongoing produce keyed by (tx_id, command_id, plan_id) of
/// `req` under `ongoing_produces_lock_` and acts on it only if it exists.
void ProduceRpcServer::Reset(const ResetCursorReq &req) {
auto key_tuple = std::make_tuple(req.tx_id, req.command_id, req.plan_id);
std::lock_guard<std::mutex> guard{ongoing_produces_lock_};
auto found = ongoing_produces_.find(key_tuple);
// It is fine if the cursor doesn't exist yet. Creating a new cursor is the
// same thing as resetting an existing one.
if (found != ongoing_produces_.end()) {
} // namespace distributed

View File

@ -1,111 +0,0 @@
/// @file
#pragma once
#include <cstdint>
#include <map>
#include <mutex>
#include <utility>
#include <vector>
#include "database/distributed/graph_db.hpp"
#include "database/distributed/graph_db_accessor.hpp"
#include "distributed/coordination.hpp"
#include "distributed/plan_consumer.hpp"
#include "distributed/pull_produce_rpc_messages.hpp"
#include "query/context.hpp"
#include "query/frontend/semantic/symbol_table.hpp"
#include "query/interpret/frame.hpp"
#include "query/plan/operator.hpp"
#include "query/typed_value.hpp"
#include "transactions/type.hpp"
namespace database {
class Worker;
namespace tx {
class EngineWorker;
namespace distributed {
class DataManager;
/// Handles the execution of a plan on the worker, requested by the remote
/// master. Assumes that (tx id, command id, plan id) uniquely identifies an
/// execution, and that there will never be parallel requests for the same
/// execution thus identified.
class ProduceRpcServer {
/// Encapsulates a Cursor execution in progress. Can be used for pulling a
/// single result from the execution, or pulling all and accumulating the
/// results. Accumulations are used for synchronizing updates in distributed
/// MG (see query::plan::Synchronize).
class OngoingProduce {
/// @param db Worker database; an accessor for `tx_id` is opened on it.
/// @param tx_id Transaction the execution belongs to.
/// @param plan_pack Cached plan and symbol table to execute.
/// @param timestamp Value stored in the evaluation context.
/// @param parameters Query parameters stored in the evaluation context.
/// @param pull_symbols Symbols whose values make up each returned row.
OngoingProduce(database::Worker *db, tx::TransactionId tx_id,
const PlanConsumer::PlanPack &plan_pack, int64_t timestamp,
const query::Parameters &parameters,
std::vector<query::Symbol> pull_symbols);
/// Returns a vector of typed values (one for each `pull_symbol`), and an
/// indication of the pull result. The result data is valid only if the
/// returned state is CURSOR_IN_PROGRESS.
std::pair<std::vector<query::TypedValue>, PullState> Pull();
/// Accumulates all the frames pulled from the cursor and returns
/// CURSOR_EXHAUSTED. If an error occurs, an appropriate value is returned.
PullState Accumulate();
/// Returns the produce to the CURSOR_IN_PROGRESS state.
void Reset();
/// Accessor opened for the transaction of this execution.
std::unique_ptr<database::GraphDbAccessor> dba_;
/// Execution context handed to the cursor on every pull.
query::ExecutionContext context_;
/// Symbols to read from the frame for each produced row.
std::vector<query::Symbol> pull_symbols_;
/// Frame the cursor pulls into.
query::Frame frame_;
/// Last known cursor state; starts as CURSOR_IN_PROGRESS.
PullState cursor_state_{PullState::CURSOR_IN_PROGRESS};
/// Rows stored for later retrieval by Pull().
std::vector<std::vector<query::TypedValue>> accumulation_;
// execution_memory_ is unique_ptr because we are passing the address to
// cursor_, and we want to preserve the pointer in case we get moved.
std::unique_ptr<utils::MonotonicBufferResource> execution_memory_;
/// Cursor created from the plan.
query::plan::UniqueCursorPtr cursor_;
/// Pulls and returns a single result from the cursor.
std::pair<std::vector<query::TypedValue>, PullState> PullOneFromCursor();
/// @param db Worker database the produces run on.
/// @param tx_engine Worker transaction engine.
/// @param coordination Coordination object supplied by the owner.
/// @param plan_consumer Source of the cached plans.
/// @param data_manager Captured by the TransactionCommandAdvanced handler.
ProduceRpcServer(database::Worker *db, tx::EngineWorker *tx_engine,
distributed::Coordination *coordination,
const PlanConsumer &plan_consumer,
DataManager *data_manager);
/// Clears all ongoing produces that are older than the oldest active
/// transaction. This function should be registered in the transaction engine
/// for transactional cache cleanup.
void ClearTransactionalCache(tx::TransactionId oldest_active);
/// Guards `ongoing_produces_`.
std::mutex ongoing_produces_lock_;
/// Mapping of (tx id, command id, plan id) to OngoingProduce.
/// The command_id should be the command_id at the initialization of a cursor
/// that can call ProduceRpcServer.
std::map<std::tuple<tx::TransactionId, tx::CommandId, int64_t>,
database::Worker *db_;
const distributed::PlanConsumer &plan_consumer_;
tx::EngineWorker *tx_engine_;
/// Gets an ongoing produce for the given pull request. Creates a new one if
/// there is none currently existing.
OngoingProduce &GetOngoingProduce(const PullReq &req);
/// Performs a single remote pull for the given request.
PullResData Pull(const PullReq &req);
/// Resets the cursor for an ongoing produce.
void Reset(const ResetCursorReq &req);
} // namespace distributed

View File

@ -1,206 +0,0 @@
#pragma once
#include <cstdint>
#include <functional>
#include <string>
#include "communication/rpc/messages.hpp"
#include "query/context.hpp"
#include "query/frontend/semantic/symbol.hpp"
#include "query/parameters.hpp"
#include "query/distributed/serialization.hpp"
#include "storage/distributed/address_types.hpp"
#include "transactions/type.hpp"
#include "database/distributed/graph_db.hpp"
#include "database/distributed/graph_db_accessor.hpp"
#include "distributed/data_manager.hpp"
(load "transactions/distributed/serialization.lcp")
(lcp:namespace distributed)
// Forward declare for LoadGraphElement.
class DataManager;
/// The default number of results returned via RPC from remote execution to the
/// master that requested it.
constexpr int kDefaultBatchSize = 20;
(lcp:define-enum pull-state
(:documentation "Returned along with a batch of results in the remote-pull
RPC. Indicates the state of execution on the worker.")
;; Result type of PullRpcClients::Pull: the worker's PullState together with
;; the batch of result frames.
(lcp:define-struct pull-data ()
((pull-state "PullState")
(frames "std::vector<std::vector<query::TypedValue>>"))
"The data returned to the end consumer (the Pull operator). Contains only
the relevant parts of the response, ready for use."))
;; SLK save hook for a member holding a vector of frames. Writes the number of
;; frames, then for every frame its size followed by each of its values.
(defun slk-save-frames (member)
size_t frame_count = self.${member}.size();
// Length prefix of the outer vector.
slk::Save(frame_count, builder);
for (const auto &frame : self.${member}) {
size_t frame_size = frame.size();
// Length prefix of this frame.
slk::Save(frame_size, builder);
for (const auto &value : frame) {
// Values are saved using the owning struct's send_versions and worker_id.
slk::Save(value, builder, self.send_versions, self.worker_id);
;; SLK load hook that reads the layout written by slk-save-frames: a frame
;; count, then per frame a size followed by that many values.
(defun slk-load-frames (member)
size_t frame_count = 0;
slk::Load(&frame_count, reader);
for (size_t frame_i = 0; frame_i < frame_count; ++frame_i) {
size_t frame_size = 0;
slk::Load(&frame_size, reader);
// The frame is pre-sized so that each value can be loaded in place.
std::vector<query::TypedValue> frame(frame_size);
for (size_t val_i = 0; val_i < frame_size; ++val_i) {
// Value loading takes the `dba` and `data_manager` load arguments.
slk::Load(&frame[val_i], reader, dba, data_manager);
(lcp:define-struct pull-res-data ()
((pull-state "PullState")
;; Frames are (de)serialized through the slk-save-frames / slk-load-frames
;; hooks instead of the default member serialization.
(frames "std::vector<std::vector<query::TypedValue>>"
:slk-save #'slk-save-frames
:slk-load #'slk-load-frames)
;; worker-id and send-versions are marked :dont-save; slk-save-frames reads
;; them while saving the frame values.
(worker-id :int16_t :dont-save t
"Id of the worker on which the response is created, used for
serializing vertices (converting local to global addresses). Indicates which
of (old, new) records of a graph element should be sent.")
(send-versions "::storage::SendVersions" :dont-save t)
;; Temporary caches used between deserialization and post-processing
;; (transferring the ownership of this data to a Cache).
(vertices "std::vector<GraphElementData<Vertex>>" :dont-save t)
(edges "std::vector<GraphElementData<Edge>>" :dont-save t)
(paths "std::vector<PathData>" :dont-save t))
"The data of the remote pull response. Post-processing is required after
deserialization to initialize Vertex/Edge typed values in the frames (possibly
encapsulated in lists/maps) to their proper values. This requires a
GraphDbAccessor and therefore can't be done as part of deserialization.
TODO - make it possible to inject a &GraphDbAccessor from the Pull layer all
the way into RPC data deserialization to remove the requirement for
post-processing. The current approach of holding references to parts of the
frame (potentially embedded in lists/maps) is too error-prone.")
;; Struct parameterized over the record type (TRecord); PullResData and
;; PathData use it with Vertex and Edge.
(lcp:define-struct (graph-element-data t-record) ()
((cypher-id :int64_t)
(global-address "::storage::Address<mvcc::VersionList<TRecord>>")
(old-record "std::unique_ptr<TRecord>")
(new-record "std::unique_ptr<TRecord>")
"::query::TypedValue *"
"The position in frame is optional. This same structure is used for
deserializing path elements, in which case the vertex/edge in question is not
directly part of the frame."))
"Temp cache for deserialized vertices and edges. These objects are
created during deserialization. They are used immediately afterwards, during
post-processing. The vertex/edge data ownership gets transferred to the Cache,
and the `element_in_frame` reference is used to set the appropriate accessor
to the appropriate value. Not used on the side that generates the response.")
GraphElementData(int64_t cypher_id, storage::Address<mvcc::VersionList<TRecord>> address,
std::unique_ptr<TRecord> old_record, std::unique_ptr<TRecord> new_record,
query::TypedValue *element_in_frame)
: cypher_id(cypher_id),
element_in_frame(element_in_frame) {}
;; Deserialized vertices and edges of a path, plus a pointer to the TypedValue
;; in the frame that the path belongs to.
(lcp:define-struct path-data ()
((vertices "std::vector<GraphElementData<Vertex>>")
(edges "std::vector<GraphElementData<Edge>>")
(path-in-frame "query::TypedValue *"))
PathData(query::TypedValue *path_in_frame) : path_in_frame(path_in_frame) {}
(:documentation "Same as `GraphElementData`, but for paths."))
PullResData() {} // Default constructor required for serialization.
// Stores the id of the responding worker and which record versions to send;
// both are consumed when the frames are saved.
PullResData(int worker_id, storage::SendVersions send_versions)
: worker_id(worker_id), send_versions(send_versions) {}
// Move-only type: copying is deleted, moving is defaulted.
PullResData(const PullResData &) = delete;
PullResData &operator=(const PullResData &) = delete;
PullResData(PullResData &&) = default;
PullResData &operator=(PullResData &&) = default;
(:serialize (:slk :load-args '((dba "database::GraphDbAccessor *")
(data-manager "distributed::DataManager *")))))
;; Remote pull RPC. The request identifies the transaction (id and snapshot),
;; the plan and the command, and carries the evaluation inputs (timestamp,
;; parameters, symbols) together with the accumulate flag, the batch size and
;; the send-versions selector. The response holds a PullResData, which is
;; loaded with the `dba` and `data-manager` load arguments.
(lcp:define-rpc pull
((tx-id "::tx::TransactionId")
(tx-snapshot "::tx::Snapshot"
:slk-save #'slk-save-snapshot
:slk-load #'slk-load-snapshot)
(plan-id :int64_t)
(command-id "::tx::CommandId")
(timestamp :int64_t)
(parameters "::query::Parameters")
(symbols "std::vector<query::Symbol>")
(accumulate :bool)
(batch-size :int64_t)
;; Indicates which of (old, new) records of a graph element should be sent.
(send-versions "::storage::SendVersions")))
((data "PullResData" :initarg :move
:slk-load (lambda (m)
slk::Load(&self->${m}, reader, dba, data_manager);
(:serialize (:slk :load-args '((dba "database::GraphDbAccessor *")
(data-manager "distributed::DataManager *"))))))
;; TODO make a separate RPC for the continuation of an existing pull, as an
;; optimization not to have to send the full PullReqData pack every time.
;; RPC asking a worker to reset a cursor. The request carries the transaction
;; id, plan id and command id; the response is empty.
(lcp:define-rpc reset-cursor
((tx-id "::tx::TransactionId")
(plan-id :int64_t)
(command-id "::tx::CommandId")))
(:response ()))
;; RPC carrying a single transaction id with an empty response. It is issued
;; to workers by PullRpcClients::NotifyAllTransactionCommandAdvanced.
(lcp:define-rpc transaction-command-advanced
(:request ((member "::tx::TransactionId")))
(:response ()))
(lcp:pop-namespace) ;; distributed

View File

@ -1,52 +0,0 @@
#include "distributed/pull_rpc_clients.hpp"
#include <functional>
#include "storage/distributed/edge.hpp"
#include "storage/distributed/vertex.hpp"
namespace distributed {
/// Schedules a PullRpc on worker `worker_id` through the coordination layer
/// and returns a future for the resulting PullData.
///
/// The request is built from the accessor's transaction id and snapshot, the
/// plan and command ids, the evaluation context's timestamp and parameters,
/// and the `symbols`, `accumulate` and `batch_size` arguments.
utils::Future<PullData> PullRpcClients::Pull(
database::GraphDbAccessor *dba, int worker_id, int64_t plan_id,
tx::CommandId command_id,
const query::EvaluationContext &evaluation_context,
const std::vector<query::Symbol> &symbols, bool accumulate,
int batch_size) {
return coordination_->ExecuteOnWorker<PullData>(
// Everything except `dba` is captured by value; presumably because the
// lambda may run after this call has returned — TODO confirm.
worker_id, [data_manager = data_manager_, dba, plan_id, command_id,
evaluation_context, symbols, accumulate,
batch_size](int worker_id, ClientPool &client_pool) {
// Custom response loader: loading a PullRes needs `dba` and
// `data_manager` in addition to the reader.
auto load_pull_res = [data_manager, dba](auto *res_reader) {
PullRes res;
slk::Load(&res, res_reader, dba, data_manager);
return res;
auto result = client_pool.CallWithLoad<PullRpc>(
load_pull_res, dba->transaction_id(), dba->transaction().snapshot(),
plan_id, command_id, evaluation_context.timestamp,
evaluation_context.parameters, symbols, accumulate, batch_size,
return PullData{, std::move(};
/// Schedules a ResetCursorRpc on worker `worker_id` for the accessor's
/// transaction and the given plan, returning a future that completes when the
/// call is done.
utils::Future<void> PullRpcClients::ResetCursor(database::GraphDbAccessor *dba,
int worker_id, int64_t plan_id,
tx::CommandId command_id) {
return coordination_->ExecuteOnWorker<void>(
worker_id, [dba, plan_id, command_id](int worker_id, auto &client) {
client.template Call<ResetCursorRpc>(dba->transaction_id(), plan_id,
/// Calls TransactionCommandAdvancedRpc with `tx_id` on workers via
/// ExecuteOnWorkers and returns the resulting futures.
/// NOTE(review): the leading `0` is presumably the id of the worker to skip
/// (compare UpdateApplyAll's `skip_worker_id`) — confirm.
PullRpcClients::NotifyAllTransactionCommandAdvanced(tx::TransactionId tx_id) {
return coordination_->ExecuteOnWorkers<void>(
0, [tx_id](int worker_id, auto &client) {
client.template Call<TransactionCommandAdvancedRpc>(tx_id);
} // namespace distributed

View File

@ -1,55 +0,0 @@
#pragma once
#include <vector>
#include "database/distributed/graph_db_accessor.hpp"
#include "distributed/coordination.hpp"
#include "distributed/pull_produce_rpc_messages.hpp"
#include "query/context.hpp"
#include "query/frontend/semantic/symbol.hpp"
#include "transactions/type.hpp"
#include "utils/future.hpp"
namespace distributed {
class DataManager;
/// Provides means of calling for the execution of a plan on some remote worker,
/// and getting the results of that execution. The results are returned in
/// batches and are therefore accompanied with an enum indicator of the state of
/// remote execution.
class PullRpcClients {
using ClientPool = communication::rpc::ClientPool;
/// Stores the given pointers; neither is owned by this object (both are kept
/// as raw pointers).
PullRpcClients(Coordination *coordination, DataManager *data_manager)
: coordination_(coordination), data_manager_(data_manager) {}
/// Calls a remote pull asynchronously. IMPORTANT: take care not to call this
/// function for the same (tx_id, worker_id, plan_id, command_id) before the
/// previous call has ended.
/// @todo: it might be cleaner to split Pull into {InitRemoteCursor,
/// Pull, RemoteAccumulate}, but that's a lot of refactoring and more
/// RPC calls.
utils::Future<PullData> Pull(
database::GraphDbAccessor *dba, int worker_id, int64_t plan_id,
tx::CommandId command_id,
const query::EvaluationContext &evaluation_context,
const std::vector<query::Symbol> &symbols, bool accumulate,
int batch_size = kDefaultBatchSize);
/// Asks the given worker to reset the cursor identified by the accessor's
/// transaction, `plan_id` and `command_id`.
utils::Future<void> ResetCursor(database::GraphDbAccessor *dba, int worker_id,
int64_t plan_id, tx::CommandId command_id);
/// Forwards to Coordination::GetWorkerIds().
auto GetWorkerIds() { return coordination_->GetWorkerIds(); }
/// Sends a TransactionCommandAdvancedRpc for `tx_id` to workers and returns
/// one future per call.
std::vector<utils::Future<void>> NotifyAllTransactionCommandAdvanced(
tx::TransactionId tx_id);
// Used to execute calls on workers.
Coordination *coordination_;
// Passed to response deserialization in Pull.
DataManager *data_manager_;
} // namespace distributed

View File

@ -1,18 +0,0 @@
#pragma once
#include "communication/rpc/messages.hpp"
#include "io/network/endpoint.hpp"
#include "slk/serialization.hpp"
#include "transactions/transaction.hpp"
(lcp:namespace distributed)
;; RPC carrying a worker id and that worker's local oldest active transaction
;; id; the response is empty. Presumably sent after a worker has run its local
;; garbage collection — TODO confirm against the sender.
(lcp:define-rpc ran-local-gc
((local-oldest-active "::tx::TransactionId")
(worker-id :int16_t)))
(:response ()))
(lcp:pop-namespace) ;; distributed

View File

@ -1,17 +0,0 @@
#pragma once
#include <memory>
#include <string>
#include "communication/rpc/messages.hpp"
#include "slk/serialization.hpp"
(lcp:namespace distributed)
;; RPC with an empty request and an empty response. Presumably the message
;; TokenSharingRpcServer uses to hand the token to the next worker — TODO
;; confirm against the server's Register call.
(lcp:define-rpc token-transfer
(:request ())
(:response ()))
(lcp:pop-namespace) ;; distributed

View File

@ -1,119 +0,0 @@
/// @file
#pragma once
#include "distributed/coordination.hpp"
#include "distributed/dgp/partitioner.hpp"
namespace database {
class GraphDb;
namespace distributed {
// TODO (buda): dgp_.Run() should be injected. This server shouldn't know
// anything about the partitioning.
// TODO (buda): It makes more sense to have centralized server which will assign
// tokens because error handling would be much easier.
// TODO (buda): Broken by design.
/// Shares the token between dynamic graph partitioner instances across workers
/// by passing the token from one worker to another, in a circular fashion. This
/// guarantees that no two workers will execute the dynamic graph partitioner
/// step at the same time.
class TokenSharingRpcServer {
/// Stores the worker id and coordination pointer, constructs the partitioner
/// from `db`, and starts the background thread that waits for the token and
/// passes it on.
TokenSharingRpcServer(database::GraphDb *db, int worker_id,
distributed::Coordination *coordination)
: worker_id_(worker_id), coordination_(coordination), dgp_(db) {
// RPC handler: an incoming request marks this worker as the token holder.
[this](auto *req_reader, auto *res_builder) { token_ = true; });
// TODO (buda): It's not trivial to move this part in the Start method
// because worker then doesn't run the step. Will resolve that with
// a different implementation of the token assignment.
runner_ = std::thread([this]() {
while (!shutting_down_) {
// If no other instances are connected just wait. It doesn't make sense
// to migrate anything because only one machine is available.
auto workers = coordination_->GetWorkerIds();
if (!(workers.size() > 1)) {
// Wait till we get the token.
while (!token_) {
if (shutting_down_) break;
if (shutting_down_) break;
token_ = false;
// Transfer token to next.
// Sorting is required for the upper_bound lookup below.
sort(workers.begin(), workers.end());
int next_worker = -1;
// The next worker is the smallest id greater than ours; if there is none,
// wrap around to the smallest id overall.
auto pos = std::upper_bound(workers.begin(), workers.end(), worker_id_);
if (pos != workers.end()) {
next_worker = *pos;
} else {
next_worker = workers[0];
// Try to transfer the token until successful.
while (!shutting_down_) {
try {
} catch (const communication::rpc::RpcFailedException &e) {
DLOG(WARNING) << "Unable to transfer token to worker "
<< next_worker;
/// Starts the token sharing server which in turn starts the dynamic graph
/// partitioner.
void Start() {
started_ = true;
token_ = true;
/// Signals the background thread to stop and joins it. On a started worker
/// with id 0 it then waits for the token to come back, giving up if the
/// cluster is no longer alive.
~TokenSharingRpcServer() {
shutting_down_ = true;
if (runner_.joinable()) runner_.join();
if (started_ && worker_id_ == 0) {
// Wait till we get the token back otherwise some worker might try to
// migrate to another worker while that worker is shutting down or
// something else bad might happen.
// TODO (buda): Solve this better in the future since this blocks
// shutting down until spinner steps complete.
while (!token_) {
// Cluster state has to be examined here because if one of the workers
// is down it doesn't make sense to wait for the token because token
// probably won't arrive back.
if (!coordination_->IsClusterAlive()) return;
// Id of this worker; used to pick the next worker in the ring.
int worker_id_;
distributed::Coordination *coordination_;
// Set by Start(); checked in the destructor.
std::atomic<bool> started_{false};
// True while this worker holds the token.
std::atomic<bool> token_{false};
// Set by the destructor to stop the background thread's loops.
std::atomic<bool> shutting_down_{false};
// Background thread created in the constructor.
std::thread runner_;
distributed::dgp::Partitioner dgp_;
} // namespace distributed

View File

@ -1,117 +0,0 @@
#include <unordered_map>
#include <vector>
#include "distributed/updates_rpc_clients.hpp"
#include "query/exceptions.hpp"
#include "utils/thread/sync.hpp"
namespace distributed {
namespace {
/// Translates an UpdateResult received from a remote worker into the matching
/// local exception (e.g. LOCK_TIMEOUT_ERROR -> utils::LockTimeoutException,
/// UPDATE_DELETED_ERROR -> RecordDeletedError).
/// NOTE(review): the lock-timeout message mentions "edge creation" although
/// this helper takes any UpdateResult — confirm against the call sites.
void RaiseIfRemoteError(UpdateResult result) {
switch (result) {
throw query::RemoveAttachedVertexException();
throw mvcc::SerializationError();
case UpdateResult::LOCK_TIMEOUT_ERROR:
throw utils::LockTimeoutException(
"Remote LockTimeoutError during edge creation");
case UpdateResult::UPDATE_DELETED_ERROR:
throw RecordDeletedError();
case UpdateResult::DONE:
} // namespace
/// Sends `delta` to worker `to_worker_id` with an UpdateRpc call, passing
/// `this_worker_id` along as the sender's id.
UpdateResult UpdatesRpcClients::Update(int this_worker_id, int to_worker_id,
const database::StateDelta &delta) {
return coordination_->GetClientPool(to_worker_id)
->Call<UpdateRpc>(delta, this_worker_id)
/// Creates a vertex on worker `worker_id` with a CreateVertexRpc call and
/// returns the cypher id and gid reported by that worker. Any result other
/// than UpdateResult::DONE fails the CHECK below.
CreatedVertexInfo UpdatesRpcClients::CreateVertex(
int worker_id, tx::TransactionId tx_id,
const std::vector<storage::Label> &labels,
const std::unordered_map<storage::Property, PropertyValue> &properties,
std::optional<int64_t> cypher_id) {
auto res = coordination_->GetClientPool(worker_id)->Call<CreateVertexRpc>(
CreateVertexReqData{tx_id, labels, properties, cypher_id});
CHECK(res.member.result == UpdateResult::DONE)
<< "Remote Vertex creation result not UpdateResult::DONE";
return CreatedVertexInfo(res.member.cypher_id, res.member.gid);
/// Creates an edge on the worker that owns `from`, which must be a remote
/// vertex. Returns the edge's cypher id and an EdgeAddress built from the
/// returned gid and the owning worker's id.
CreatedEdgeInfo UpdatesRpcClients::CreateEdge(
int this_worker_id, tx::TransactionId tx_id, VertexAccessor &from,
VertexAccessor &to, storage::EdgeType edge_type,
std::optional<int64_t> cypher_id) {
CHECK(from.address().is_remote()) << "In CreateEdge `from` must be remote";
// The edge is stored on the same worker as its `from` vertex.
int from_worker = from.address().worker_id();
auto res =
from.gid(), to.GlobalAddress(), edge_type, tx_id, cypher_id});
return CreatedEdgeInfo(res.member.cypher_id,
storage::EdgeAddress{res.member.gid, from_worker});
/// Sends an AddInEdgeRpc to the worker that owns `to`. Preconditions, enforced
/// by the CHECK: `to` and `edge_address` are remote, and `from` lives on a
/// different worker than `to`.
void UpdatesRpcClients::AddInEdge(int this_worker_id, tx::TransactionId tx_id,
VertexAccessor &from,
storage::EdgeAddress edge_address,
VertexAccessor &to,
storage::EdgeType edge_type) {
CHECK(to.address().is_remote() && edge_address.is_remote() &&
(from.GlobalAddress().worker_id() != to.address().worker_id()))
<< "AddInEdge should only be called when `to` is remote and "
"`from` is not on the same worker as `to`.";
auto worker_id = to.GlobalAddress().worker_id();
auto res = coordination_->GetClientPool(worker_id)->Call<AddInEdgeRpc>(
AddInEdgeReqData{this_worker_id, from.GlobalAddress(), edge_address,
to.gid(), edge_type, tx_id});
/// Sends a RemoveVertexRpc for vertex `gid` to worker `to_worker_id`;
/// `check_empty` is forwarded unchanged in the request.
void UpdatesRpcClients::RemoveVertex(int this_worker_id, int to_worker_id,
tx::TransactionId tx_id, gid::Gid gid,
bool check_empty) {
auto res = coordination_->GetClientPool(to_worker_id)->Call<RemoveVertexRpc>(
RemoveVertexReqData{this_worker_id, gid, tx_id, check_empty});
/// Sends a RemoveEdgeRpc for edge `edge_gid`, together with the gid of its
/// `from` vertex and the address of its `to` vertex.
void UpdatesRpcClients::RemoveEdge(int this_worker_id, int to_worker_id,
tx::TransactionId tx_id, gid::Gid edge_gid,
gid::Gid vertex_from_id,
storage::VertexAddress vertex_to_addr) {
auto res =
->Call<RemoveEdgeRpc>(RemoveEdgeData{this_worker_id, tx_id, edge_gid,
vertex_from_id, vertex_to_addr});
/// Asks worker `to_worker_id` to remove the edge `edge_address` from vertex
/// `vertex_id`. `edge_address` must be remote (enforced by the CHECK).
void UpdatesRpcClients::RemoveInEdge(int this_worker_id, int to_worker_id,
tx::TransactionId tx_id,
gid::Gid vertex_id,
storage::EdgeAddress edge_address) {
CHECK(edge_address.is_remote()) << "RemoveInEdge edge_address is local.";
auto res = coordination_->GetClientPool(to_worker_id)
this_worker_id, tx_id, vertex_id, edge_address});
/// Calls UpdateApplyRpc for `tx_id` on workers through ExecuteOnWorkers,
/// passing `skip_worker_id` as its first argument, and returns one future per
/// call holding that worker's UpdateResult.
std::vector<utils::Future<UpdateResult>> UpdatesRpcClients::UpdateApplyAll(
int skip_worker_id, tx::TransactionId tx_id) {
return coordination_->ExecuteOnWorkers<UpdateResult>(
skip_worker_id, [tx_id](int worker_id, auto &client) {
auto res = client.template Call<UpdateApplyRpc>(tx_id);
return res.member;
} // namespace distributed

View File

@ -1,82 +0,0 @@
#pragma once
#include <unordered_map>
#include <vector>
#include "distributed/coordination.hpp"
#include "distributed/updates_rpc_messages.hpp"
#include "durability/distributed/state_delta.hpp"
#include "query/typed_value.hpp"
#include "storage/common/types/types.hpp"
#include "storage/distributed/address_types.hpp"
#include "storage/distributed/gid.hpp"
#include "transactions/type.hpp"
#include "utils/future.hpp"
namespace distributed {
/// Exposes the functionality to send updates to other workers (that own the
/// graph element we are updating). Also enables us to call for a worker to
/// apply the accumulated deferred updates, or discard them.
class UpdatesRpcClients {
/// Stores the coordination pointer used to reach other workers; it is kept
/// as a raw pointer.
explicit UpdatesRpcClients(Coordination *coordination)
: coordination_(coordination) {}
/// Sends an update delta to the given worker.
UpdateResult Update(int this_worker_id, int to_worker_id,
const database::StateDelta &delta);
/// Creates a vertex on the given worker and returns its cypher id and gid.
CreatedVertexInfo CreateVertex(
int worker_id, tx::TransactionId tx_id,
const std::vector<storage::Label> &labels,
const std::unordered_map<storage::Property, PropertyValue> &properties,
std::optional<int64_t> cypher_id = std::nullopt);
/// Creates an edge on the given worker and returns its address. If the `to`
/// vertex is on the same worker as `from`, then all remote CRUD will be
/// handled by a call to this function. Otherwise a separate call to
/// `AddInEdge` might be necessary. Throws all the exceptions that can
/// occur remotely as a result of updating a vertex.
CreatedEdgeInfo CreateEdge(int this_worker_id, tx::TransactionId tx_id,
VertexAccessor &from, VertexAccessor &to,
storage::EdgeType edge_type,
std::optional<int64_t> cypher_id = std::nullopt);
// TODO (buda): Another machine in the cluster is asked to create an edge.
// cypher_id should be generated in that process. It probably doesn't make
// sense to have optional cypher id here. Maybe for the recovery purposes.
/// Adds the edge with the given address to the `to` vertex as an incoming
/// edge. Only used when `to` is remote and not on the same worker as `from`.
void AddInEdge(int this_worker_id, tx::TransactionId tx_id,
VertexAccessor &from, storage::EdgeAddress edge_address,
VertexAccessor &to, storage::EdgeType edge_type);
/// Removes a vertex from the other worker.
void RemoveVertex(int this_worker_id, int to_worker_id,
tx::TransactionId tx_id, gid::Gid gid, bool check_empty);
/// Removes an edge on another worker. This also handles the `from` vertex
/// outgoing edge, as that vertex is on the same worker as the edge. If the
/// `to` vertex is on the same worker, then that side is handled too by the
/// single RPC call, otherwise a separate call has to be made to
/// RemoveInEdge.
void RemoveEdge(int this_worker_id, int to_worker_id, tx::TransactionId tx_id,
gid::Gid edge_gid, gid::Gid vertex_from_id,
storage::VertexAddress vertex_to_addr);
/// Removes the edge with the given address from the incoming edges of vertex
/// `vertex_id` on another worker. The companion call to RemoveEdge when the
/// `to` vertex is not on the same worker as the edge.
void RemoveInEdge(int this_worker_id, int to_worker_id,
tx::TransactionId tx_id, gid::Gid vertex_id,
storage::EdgeAddress edge_address);
/// Calls for all the workers (except the given one) to apply their updates
/// and returns the future results.
std::vector<utils::Future<UpdateResult>> UpdateApplyAll(
int skip_worker_id, tx::TransactionId tx_id);
// Used to obtain client pools for, and execute calls on, other workers.
Coordination *coordination_;
} // namespace distributed

View File

@ -1,136 +0,0 @@
#pragma once
#include <unordered_map>
#include "communication/rpc/messages.hpp"
#include "database/distributed/serialization.hpp"
#include "durability/distributed/state_delta.hpp"
#include "storage/distributed/address_types.hpp"
#include "storage/distributed/gid.hpp"
#include "storage/distributed/rpc/serialization.hpp"
#include "transactions/type.hpp"
(lcp:namespace distributed)
;; Values referenced by the updates client/server code include DONE,
;; SERIALIZATION_ERROR, UPDATE_DELETED_ERROR and LOCK_TIMEOUT_ERROR.
(lcp:define-enum update-result
(:documentation "The result of sending or applying a deferred update to a worker.")
;; RPC that ships a single StateDelta together with the id of the sending
;; worker; the response is an UpdateResult.
(lcp:define-rpc update
(:request ((member "::database::StateDelta")
(worker-id :int64_t)))
(:response ((member "UpdateResult"))))
;; RPC asking a worker to apply the updates of the given transaction; the
;; response is the resulting UpdateResult.
(lcp:define-rpc update-apply
(:request ((member "::tx::TransactionId")))
(:response ((member "UpdateResult"))))
;; Response payload of the create-vertex and create-edge RPCs: the outcome
;; plus the identifiers of the created element.
(lcp:define-struct create-result ()
((result "UpdateResult")
(cypher-id :int64_t :documentation "Only valid if creation was successful.")
(gid "::gid::Gid" :documentation "Only valid if creation was successful."))
(:serialize (:slk)))
;; Request payload of the create-vertex RPC: the transaction, the labels and
;; properties of the new vertex, and an optional cypher id.
(lcp:define-struct create-vertex-req-data ()
((tx-id "::tx::TransactionId")
(labels "std::vector<storage::Label>")
(properties "std::unordered_map<storage::Property, PropertyValue>")
(cypher-id "std::optional<int64_t>"))
(:serialize (:slk)))
;; RPC for creating a vertex on another worker.
(lcp:define-rpc create-vertex
(:request ((member "CreateVertexReqData")))
(:response ((member "CreateResult"))))
;; Request payload of the create-edge RPC. `from` is a gid while `to` is a
;; full vertex address; worker-id is filled with the sending worker's id.
(lcp:define-struct create-edge-req-data ()
((worker-id :int64_t)
(from "::gid::Gid")
(to "::storage::VertexAddress")
(edge-type "::storage::EdgeType")
(tx-id "::tx::TransactionId")
(cypher-id "std::optional<int64_t>"))
(:serialize (:slk)))
;; RPC for creating an edge on another worker.
(lcp:define-rpc create-edge
(:request ((member "CreateEdgeReqData")))
(:response ((member "CreateResult"))))
;; Request payload of the add-in-edge RPC. `to` is a gid while `from` and the
;; edge are full addresses; worker-id is filled with the sending worker's id.
(lcp:define-struct add-in-edge-req-data ()
((worker-id :int64_t)
(from "::storage::VertexAddress")
(edge-address "::storage::EdgeAddress")
(to "::gid::Gid")
(edge-type "::storage::EdgeType")
(tx-id "::tx::TransactionId"))
(:serialize (:slk)))
;; RPC for registering an incoming edge on a vertex owned by another worker.
(lcp:define-rpc add-in-edge
(:request ((member "AddInEdgeReqData")))
(:response ((member "UpdateResult"))))
;; Request payload of the remove-vertex RPC; worker-id is filled with the
;; sending worker's id.
(lcp:define-struct remove-vertex-req-data ()
((worker-id :int64_t)
(gid "::gid::Gid")
(tx-id "::tx::TransactionId")
(check-empty :bool))
(:serialize (:slk)))
;; RPC for removing a vertex owned by another worker.
(lcp:define-rpc remove-vertex
(:request ((member "RemoveVertexReqData")))
(:response ((member "UpdateResult"))))
;; Request payload of the remove-edge RPC: the edge and its `from` vertex are
;; given as gids, the `to` vertex as a full address.
(lcp:define-struct remove-edge-data ()
((worker-id :int64_t)
(tx-id "::tx::TransactionId")
(edge-id "::gid::Gid")
(vertex-from-id "::gid::Gid")
(vertex-to-address "::storage::VertexAddress"))
(:serialize (:slk)))
;; RPC for removing an edge owned by another worker.
(lcp:define-rpc remove-edge
(:request ((member "RemoveEdgeData")))
(:response ((member "UpdateResult"))))
;; Request payload of the remove-in-edge RPC: the vertex is given as a gid,
;; the edge as a full address.
(lcp:define-struct remove-in-edge-data ()
((worker-id :int64_t)
(tx-id "::tx::TransactionId")
(vertex "::gid::Gid")
(edge-address "::storage::EdgeAddress"))
(:serialize (:slk)))
;; RPC for removing an incoming edge from a vertex owned by another worker.
(lcp:define-rpc remove-in-edge
(:request ((member "RemoveInEdgeData")))
(:response ((member "UpdateResult"))))
;; Cypher id and gid of a newly created graph element; returned by the
;; server-side TransactionUpdates::CreateVertex and CreateEdge.
(lcp:define-struct created-info ()
((cypher-id "int64_t")
(gid "::gid::Gid"))
(:public #>cpp
CreatedInfo(int64_t cypher_id, gid::Gid gid)
: cypher_id(cypher_id), gid(gid) {}
;; Cypher id and gid of a remotely created vertex; returned by
;; UpdatesRpcClients::CreateVertex.
(lcp:define-struct created-vertex-info ()
((cypher-id "int64_t")
(gid "::gid::Gid"))
(:public #>cpp
CreatedVertexInfo(int64_t cypher_id, gid::Gid gid)
: cypher_id(cypher_id), gid(gid) {}
;; Cypher id and address of a remotely created edge; returned by
;; UpdatesRpcClients::CreateEdge.
(lcp:define-struct created-edge-info ()
((cypher-id "int64_t")
(edge-address "::storage::EdgeAddress"))
(:public #>cpp
CreatedEdgeInfo(int64_t cypher_id, storage::EdgeAddress edge_address)
: cypher_id(cypher_id), edge_address(edge_address) {}
(lcp:pop-namespace) ;; distributed

View File

@ -1,495 +0,0 @@
#include "distributed/updates_rpc_server.hpp"
#include <utility>
#include <glog/logging.h>
#include "utils/thread/sync.hpp"
namespace distributed {
/// Records `delta`, tagged with the id of the worker that sent it, in the list
/// of deferred deltas kept for the affected record. An entry for the record is
/// created on first use. Always returns UpdateResult::DONE; the fail-fast
/// update attempt is commented out (see the TODO below).
template <typename TRecordAccessor>
UpdateResult UpdatesRpcServer::TransactionUpdates<TRecordAccessor>::Emplace(
const database::StateDelta &delta, int worker_id) {
// Pick the gid field of the delta that matches the accessor type.
auto gid = std::is_same<TRecordAccessor, VertexAccessor>::value
? delta.vertex_id
: delta.edge_id;
std::lock_guard<utils::SpinLock> guard{lock_};
auto found = deltas_.find(gid);
if (found == deltas_.end()) {
found = deltas_
.emplace(gid, std::make_pair(FindAccessor(gid),
found->second.second.emplace_back(delta, worker_id);
// TODO call `RecordAccessor::update` to force serialization errors to
// fail-fast (as opposed to when all the deltas get applied).
// This is problematic because `VersionList::update` needs to become
// thread-safe within the same transaction. Note that the concurrency is
// possible both between the owner worker interpretation thread and an RPC
// thread (current thread), as well as multiple RPC threads if this
// object's lock is released (perhaps desirable).
// A potential solution *might* be that `LockStore::Lock` returns a `bool`
// indicating if the caller was the one obtaining the lock (not the same
// as lock already being held by the same transaction).
// Another thing that needs to be done (if we do this) is ensuring that
// `LockStore::Take` is thread-safe when called in parallel in the same
// transaction. Currently it's thread-safe only when called in parallel
// from different transactions (only one manages to take the RecordLock).
// Deferring the implementation of this as it's tricky, and essentially an
// optimization.
// try {
// found->second.first.update();
// } catch (const mvcc::SerializationError &) {
// return UpdateResult::SERIALIZATION_ERROR;
// } catch (const RecordDeletedError &) {
// return UpdateResult::UPDATE_DELETED_ERROR;
// } catch (const utils::LockTimeoutException &) {
// return UpdateResult::LOCK_TIMEOUT_ERROR;
// }
return UpdateResult::DONE;
/// Inserts a vertex through this transaction's accessor, adds the given labels
/// and properties to it, and pairs the new accessor with an empty delta list
/// under the lock (presumably as a new `deltas_` entry — TODO confirm).
/// Returns the vertex's cypher id and gid.
template <typename TRecordAccessor>
CreatedInfo UpdatesRpcServer::TransactionUpdates<TRecordAccessor>::CreateVertex(
const std::vector<storage::Label> &labels,
const std::unordered_map<storage::Property, PropertyValue> &properties,
std::optional<int64_t> cypher_id) {
auto result = db_accessor_->InsertVertex(std::nullopt, cypher_id);
for (auto &label : labels) result.add_label(label);
for (auto &kv : properties) result.PropsSet(kv.first, kv.second);
std::lock_guard<utils::SpinLock> guard{lock_};
std::make_pair(result, std::vector<DeltaPair>{}));
return CreatedInfo(result.CypherId(), result.gid());
/// Inserts an edge (via InsertOnlyEdge) between the `from` vertex, given by
/// gid, and the `to` vertex, given by address, and pairs the new accessor with
/// an empty delta list under the lock. Returns the edge's cypher id and gid.
template <typename TRecordAccessor>
CreatedInfo UpdatesRpcServer::TransactionUpdates<TRecordAccessor>::CreateEdge(
gid::Gid from, storage::VertexAddress to, storage::EdgeType edge_type,
int worker_id, std::optional<int64_t> cypher_id) {
auto &db = db_accessor_->db();
auto from_addr =
storage::VertexAddress(from, worker_id));
auto to_addr =;
auto edge = db_accessor_->InsertOnlyEdge(from_addr, to_addr, edge_type,
std::nullopt, cypher_id);
std::lock_guard<utils::SpinLock> guard{lock_};
std::make_pair(edge, std::vector<DeltaPair>{}));
return CreatedInfo(edge.CypherId(), edge.gid());
/// Applies every deferred delta to its record while holding the lock.
/// mvcc::SerializationError, RecordDeletedError and utils::LockTimeoutException
/// thrown while applying are converted into the corresponding UpdateResult
/// error codes; UpdateResult::DONE is returned otherwise.
template <typename TRecordAccessor>
UpdateResult UpdatesRpcServer::TransactionUpdates<TRecordAccessor>::Apply() {
std::lock_guard<utils::SpinLock> guard{lock_};
for (auto &kv : deltas_) {
auto &record_accessor = kv.second.first;
// We need to reconstruct the record as in the meantime some local
// update might have updated it.
for (auto &pair : kv.second.second) {
auto delta =;
try {
auto &dba = *db_accessor_;
switch (delta.type) {
// Transaction-level, creation and index deltas are never valid for a
// remote record update and are treated as fatal.
case database::StateDelta::Type::TRANSACTION_BEGIN:
case database::StateDelta::Type::TRANSACTION_COMMIT:
case database::StateDelta::Type::TRANSACTION_ABORT:
case database::StateDelta::Type::CREATE_VERTEX:
case database::StateDelta::Type::CREATE_EDGE:
case database::StateDelta::Type::BUILD_INDEX:
LOG(FATAL) << "Can only apply record update deltas for remote "
"graph element";
case database::StateDelta::Type::REMOVE_VERTEX:
if (!db_accessor().RemoveVertex(
reinterpret_cast<VertexAccessor &>(record_accessor),
delta.check_empty)) {
case database::StateDelta::Type::SET_PROPERTY_VERTEX:
case database::StateDelta::Type::SET_PROPERTY_EDGE:
record_accessor.PropsSet(, delta.value);
case database::StateDelta::Type::ADD_LABEL:
reinterpret_cast<VertexAccessor &>(record_accessor)
case database::StateDelta::Type::REMOVE_LABEL:
reinterpret_cast<VertexAccessor &>(record_accessor)
case database::StateDelta::Type::ADD_OUT_EDGE:
reinterpret_cast<Vertex &>(*record_accessor.GetNew())
case database::StateDelta::Type::ADD_IN_EDGE:
reinterpret_cast<Vertex &>(*record_accessor.GetNew())
case database::StateDelta::Type::REMOVE_EDGE:
// We only remove the edge as a result of this StateDelta,
// because the removal of edge from vertex in/out is performed
// in REMOVE_[IN/OUT]_EDGE deltas.
reinterpret_cast<EdgeAccessor &>(record_accessor), false,
case database::StateDelta::Type::REMOVE_OUT_EDGE:
reinterpret_cast<VertexAccessor &>(record_accessor)
case database::StateDelta::Type::REMOVE_IN_EDGE:
reinterpret_cast<VertexAccessor &>(record_accessor)
} catch (const mvcc::SerializationError &) {
return UpdateResult::SERIALIZATION_ERROR;
} catch (const RecordDeletedError &) {
return UpdateResult::UPDATE_DELETED_ERROR;
} catch (const utils::LockTimeoutException &) {
return UpdateResult::LOCK_TIMEOUT_ERROR;
return UpdateResult::DONE;
/// Applies the deferred deltas stored for `gid` that were sent by `worker_id`
/// directly onto the record `*newr`. If `*newr` is null it is first created by
/// cloning `*old`. Does nothing when no deltas are stored for `gid`.
template <typename TRecordAccessor>
void UpdatesRpcServer::TransactionUpdates<TRecordAccessor>::ApplyDeltasToRecord(
gid::Gid gid, int worker_id, TRecord **old, TRecord **newr) {
std::lock_guard<utils::SpinLock> guard{lock_};
auto found = deltas_.find(gid);
if (found == deltas_.end()) return;
// Ensures a new record exists before it is modified, cloning the old record
// on first use.
auto update = [](auto **old, auto **newr) {
if (!*newr) {
DCHECK(*old) << "Trying to create new record but pointer to old record "
"is nullptr.";
*newr = (*old)->CloneData();
for (auto &pair : found->second.second) {
auto delta =;
// Deltas sent by other workers are skipped.
if (worker_id != pair.worker_id) continue;
switch (delta.type) {
case database::StateDelta::Type::SET_PROPERTY_VERTEX:
case database::StateDelta::Type::SET_PROPERTY_EDGE:
update(old, newr);
(*newr)->properties_.set(, delta.value);
case database::StateDelta::Type::ADD_LABEL: {
update(old, newr);
auto &labels = reinterpret_cast<Vertex *>(*newr)->labels_;
if (!utils::Contains(labels, delta.label)) {
case database::StateDelta::Type::REMOVE_LABEL: {
update(old, newr);
auto &labels = reinterpret_cast<Vertex *>(*newr)->labels_;
// NOTE(review): this `found` shadows the map iterator declared above.
auto found = std::find(labels.begin(), labels.end(), delta.label);
if (found == labels.end()) continue;
std::swap(*found, labels.back());
case database::StateDelta::Type::ADD_OUT_EDGE:
update(old, newr);
reinterpret_cast<Vertex *>(*newr)->out_.emplace(
delta.vertex_to_address, delta.edge_address, delta.edge_type);
case database::StateDelta::Type::ADD_IN_EDGE:
update(old, newr);
reinterpret_cast<Vertex *>(*newr)->in_.emplace(
delta.vertex_from_address, delta.edge_address, delta.edge_type);
case database::StateDelta::Type::REMOVE_OUT_EDGE:
update(old, newr);
reinterpret_cast<Vertex *>(*newr)->out_.RemoveEdge(delta.edge_address);
case database::StateDelta::Type::REMOVE_IN_EDGE:
update(old, newr);
reinterpret_cast<Vertex *>(*newr)->in_.RemoveEdge(delta.edge_address);
// Effects of REMOVE VERTEX and REMOVE EDGE aren't visible in the
// current command id so we can safely ignore this case.
// Other deltas we're ignoring don't update record.
// Stores `db` and registers the update-related RPC handlers on
// `coordination`. Every handler follows the same shape: load the request with
// slk::Load, buffer or perform the update, then write the response with
// slk::Save.
// NOTE(review): several statements of this constructor are not visible in this
// excerpt (some `coordination->Register<...>(` openers, closing braces,
// `break`/`default` labels and a few call arguments); confirm against the full
// file before changing any logic here.
UpdatesRpcServer::UpdatesRpcServer(database::GraphDb *db,
distributed::Coordination *coordination)
: db_(db) {
// Generic delta update: the delta type decides which buffer receives it.
coordination->Register<UpdateRpc>([this](auto *req_reader,
auto *res_builder) {
UpdateReq req;
slk::Load(&req, req_reader);
using DeltaType = database::StateDelta::Type;
auto &delta = req.member;
switch (delta.type) {
// Deltas listed here are buffered in vertex_updates_.
case DeltaType::ADD_LABEL:
case DeltaType::REMOVE_LABEL:
case database::StateDelta::Type::REMOVE_OUT_EDGE:
case database::StateDelta::Type::REMOVE_IN_EDGE: {
UpdateRes res(GetUpdates(vertex_updates_, delta.transaction_id)
.Emplace(delta, req.worker_id));
slk::Save(res, res_builder);
// Edge property deltas are buffered in edge_updates_.
case DeltaType::SET_PROPERTY_EDGE: {
UpdateRes res(GetUpdates(edge_updates_, delta.transaction_id)
.Emplace(delta, req.worker_id));
slk::Save(res, res_builder);
// Presumably the default branch: any other delta type aborts the process.
LOG(FATAL) << "Can't perform a remote update with delta type: "
<< static_cast<int>(req.member.type);
// Applies everything buffered for the transaction carried in the request.
[this](auto *req_reader, auto *res_builder) {
UpdateApplyReq req;
slk::Load(&req, req_reader);
UpdateApplyRes res(Apply(req.member));
slk::Save(res, res_builder);
// Vertex creation; the response always reports UpdateResult::DONE.
coordination->Register<CreateVertexRpc>([this](auto *req_reader,
auto *res_builder) {
CreateVertexReq req;
slk::Load(&req, req_reader);
auto result = GetUpdates(vertex_updates_, req.member.tx_id)
CreateVertexRes res(
CreateResult{UpdateResult::DONE, result.cypher_id, result.gid});
slk::Save(res, res_builder);
// Edge creation, delegated to CreateEdge().
[this](auto *req_reader, auto *res_builder) {
CreateEdgeReq req;
slk::Load(&req, req_reader);
auto data = req.member;
auto creation_result = CreateEdge(data);
// If `from` and `to` are both on this worker, we handle it in this
// RPC call. Do it only if CreateEdge succeeded.
if (creation_result.result == UpdateResult::DONE && == db_->WorkerId()) {
auto to_delta = database::StateDelta::AddInEdge(
data.tx_id,, {data.from, db_->WorkerId()},
{creation_result.gid, db_->WorkerId()}, data.edge_type);
creation_result.result =
GetUpdates(vertex_updates_, data.tx_id)
.Emplace(to_delta, data.worker_id);
CreateEdgeRes res(creation_result);
slk::Save(res, res_builder);
// Buffers an in-edge addition on a vertex held by this worker.
[this](auto *req_reader, auto *res_builder) {
AddInEdgeReq req;
slk::Load(&req, req_reader);
auto to_delta = database::StateDelta::AddInEdge(
req.member.tx_id,, req.member.from,
req.member.edge_address, req.member.edge_type);
auto result = GetUpdates(vertex_updates_, req.member.tx_id)
.Emplace(to_delta, req.member.worker_id);
AddInEdgeRes res(result);
slk::Save(res, res_builder);
// Buffers a vertex removal.
[this](auto *req_reader, auto *res_builder) {
RemoveVertexReq req;
slk::Load(&req, req_reader);
auto to_delta = database::StateDelta::RemoveVertex(
req.member.tx_id, req.member.gid, req.member.check_empty);
auto result = GetUpdates(vertex_updates_, req.member.tx_id)
.Emplace(to_delta, req.member.worker_id);
RemoveVertexRes res(result);
slk::Save(res, res_builder);
// Edge removal, delegated to RemoveEdge().
[this](auto *req_reader, auto *res_builder) {
RemoveEdgeReq req;
slk::Load(&req, req_reader);
RemoveEdgeRes res(RemoveEdge(req.member));
slk::Save(res, res_builder);
// Buffers an in-edge removal on a vertex held by this worker.
coordination->Register<RemoveInEdgeRpc>([this](auto *req_reader,
auto *res_builder) {
RemoveInEdgeReq req;
slk::Load(&req, req_reader);
auto data = req.member;
RemoveInEdgeRes res(
GetUpdates(vertex_updates_, data.tx_id)
.Emplace(database::StateDelta::RemoveInEdge(data.tx_id, data.vertex,
slk::Save(res, res_builder);
// Applies the buffered vertex updates and then the buffered edge updates of
// transaction `tx_id`. Returns the first result that is not
// UpdateResult::DONE (vertex result takes precedence), otherwise DONE.
// A transaction with no buffered updates counts as DONE.
// NOTE(review): the class documentation says the per-transaction cache is
// cleared after applying, but no erase call is visible in this excerpt --
// confirm against the full file.
UpdateResult UpdatesRpcServer::Apply(tx::TransactionId tx_id) {
// Shared helper so both collections are handled identically.
auto apply = [tx_id](auto &collection) {
auto access = collection.access();
auto found = access.find(tx_id);
if (found == access.end()) {
return UpdateResult::DONE;
auto result = found->second.Apply();
return result;
// Both collections are always applied, even if the first one fails.
auto vertex_result = apply(vertex_updates_);
auto edge_result = apply(edge_updates_);
if (vertex_result != UpdateResult::DONE) return vertex_result;
if (edge_result != UpdateResult::DONE) return edge_result;
return UpdateResult::DONE;
/// Vertex specialization: forwards to the updates buffered for `tx_id` in
/// `vertex_updates_`. Does nothing when no updates exist for the transaction.
template <>
void UpdatesRpcServer::ApplyDeltasToRecord<Vertex>(tx::TransactionId tx_id,
                                                   gid::Gid gid, int worker_id,
                                                   Vertex **old,
                                                   Vertex **newr) {
  auto accessor = vertex_updates_.access();
  auto tx_updates = accessor.find(tx_id);
  // No buffered vertex updates for this transaction: leave the records as-is.
  if (tx_updates == accessor.end()) return;
  tx_updates->second.ApplyDeltasToRecord(gid, worker_id, old, newr);
}
/// Edge specialization: forwards to the updates buffered for `tx_id` in
/// `edge_updates_`. Does nothing when no updates exist for the transaction.
template <>
void UpdatesRpcServer::ApplyDeltasToRecord<Edge>(tx::TransactionId tx_id,
                                                 gid::Gid gid, int worker_id,
                                                 Edge **old,
                                                 Edge **newr) {
  auto accessor = edge_updates_.access();
  auto tx_updates = accessor.find(tx_id);
  // No buffered edge updates for this transaction: leave the records as-is.
  if (tx_updates == accessor.end()) return;
  tx_updates->second.ApplyDeltasToRecord(gid, worker_id, old, newr);
}
// Walks both update maps looking for entries whose transaction id is smaller
// than `oldest_active`.
// NOTE(review): the statements executed for matching entries (presumably the
// removal from the map) and the closing braces are not visible in this
// excerpt -- confirm against the full file.
void UpdatesRpcServer::ClearTransactionalCache(
tx::TransactionId oldest_active) {
auto vertex_access = vertex_updates_.access();
for (auto &kv : vertex_access) {
if (kv.first < oldest_active) {
auto edge_access = edge_updates_.access();
for (auto &kv : edge_access) {
if (kv.first < oldest_active) {
// Gets/creates the TransactionUpdates for the given transaction.
// The entry is constructed in place from (db_, tx_id) via piecewise tuples, so
// a TransactionUpdates object is only built when the key is emplaced.
// NOTE(review): the tail of the return expression (selecting the mapped value
// from the emplace result) is not visible in this excerpt.
template <typename TAccessor>
UpdatesRpcServer::TransactionUpdates<TAccessor> &UpdatesRpcServer::GetUpdates(
MapT<TAccessor> &updates, tx::TransactionId tx_id) {
return updates.access()
.emplace(tx_id, std::make_tuple(tx_id),
std::make_tuple(std::ref(db_), tx_id))
// Creates the edge described by `req` in the edge updates of the transaction
// and buffers the matching out-edge delta on the `from` vertex. Returns the
// result of buffering that delta together with the new edge's cypher_id and
// gid.
// NOTE(review): some call arguments are missing from this excerpt (the empty
// slots after `req.from,` and the end of the CreateEdge argument list).
CreateResult UpdatesRpcServer::CreateEdge(const CreateEdgeReqData &req) {
auto ids = GetUpdates(edge_updates_, req.tx_id)
.CreateEdge(req.from,, req.edge_type, db_->WorkerId(),
// cypher_id doesn't have to be inserted because edge is stored
// somewhere else in the cluster. Here is only vertex update.
auto from_delta = database::StateDelta::AddOutEdge(
req.tx_id, req.from,, {ids.gid, db_->WorkerId()}, req.edge_type);
auto result = GetUpdates(vertex_updates_, req.tx_id)
.Emplace(from_delta, req.worker_id);
return {result, ids.cypher_id, ids.gid};
/// Buffers the deltas needed to remove the edge described by `data`.
///
/// The steps run in order and stop at the first one that does not yield
/// UpdateResult::DONE:
///   1. removal of the edge itself (edge updates),
///   2. removal of the out-edge entry on the `from` vertex (vertex updates),
///   3. removal of the in-edge entry on the `to` vertex, only when that vertex
///      lives on this worker.
///
/// @return the result of the last step that was attempted.
UpdateResult UpdatesRpcServer::RemoveEdge(const RemoveEdgeData &data) {
  // Step 1: the edge record.
  auto edge_delta = database::StateDelta::RemoveEdge(data.tx_id, data.edge_id);
  auto status = GetUpdates(edge_updates_, data.tx_id)
                    .Emplace(edge_delta, data.worker_id);
  if (status != UpdateResult::DONE) return status;

  // Step 2: the out-edge entry; the `from` vertex is always local.
  auto out_delta = database::StateDelta::RemoveOutEdge(
      data.tx_id, data.vertex_from_id, {data.edge_id, db_->WorkerId()});
  status = GetUpdates(vertex_updates_, data.tx_id)
               .Emplace(out_delta, data.worker_id);
  if (status != UpdateResult::DONE) return status;

  // Step 3: the in-edge entry; skipped when the `to` vertex is remote.
  if (data.vertex_to_address.worker_id() != db_->WorkerId()) return status;
  auto in_delta = database::StateDelta::RemoveInEdge(
      data.tx_id, data.vertex_to_address.gid(),
      {data.edge_id, db_->WorkerId()});
  return GetUpdates(vertex_updates_, data.tx_id)
      .Emplace(in_delta, data.worker_id);
}
template <>
gid::Gid gid) {
return db_accessor_->FindVertex(gid, false);
/// EdgeAccessor specialization of the FindAccessor helper: looks the edge up
/// by `gid` through this transaction's database accessor.
// NOTE(review): the second argument of FindEdge is passed as `false`; its
// meaning is not visible here -- confirm against the accessor's declaration.
template <>
EdgeAccessor
UpdatesRpcServer::TransactionUpdates<EdgeAccessor>::FindAccessor(
    gid::Gid gid) {
  return db_accessor_->FindEdge(gid, false);
}
} // namespace distributed

View File

@ -1,136 +0,0 @@
/// @file
#pragma once
#include <unordered_map>
#include <vector>
#include "glog/logging.h"
#include "data_structures/concurrent/concurrent_map.hpp"
#include "database/distributed/distributed_graph_db.hpp"
#include "database/distributed/graph_db_accessor.hpp"
#include "distributed/coordination.hpp"
#include "distributed/updates_rpc_messages.hpp"
#include "durability/distributed/state_delta.hpp"
#include "query/typed_value.hpp"
#include "storage/common/types/types.hpp"
#include "storage/distributed/edge_accessor.hpp"
#include "storage/distributed/gid.hpp"
#include "storage/distributed/vertex_accessor.hpp"
#include "transactions/type.hpp"
#include "utils/thread/sync.hpp"
namespace distributed {
/// An RPC server that accepts and holds deferred updates (deltas) until it's
/// told to apply or discard them. The updates are organized and applied per
/// transaction in this single updates server.
/// Attempts to get serialization and update-after-delete errors to happen as
/// soon as possible during query execution (fail fast).
// NOTE(review): access specifiers, closing braces and a few declarations are
// not visible in this excerpt; confirm member visibility against the full
// header.
class UpdatesRpcServer {
// Remote updates for one transaction.
template <typename TRecordAccessor>
class TransactionUpdates {
// A buffered delta together with the id of the worker that sent it.
struct DeltaPair {
DeltaPair(const database::StateDelta &delta, int worker_id)
: delta(delta), worker_id(worker_id) {}
database::StateDelta delta;
int worker_id;
// Record type handled by TRecordAccessor (rest of the alias is not visible
// in this excerpt).
using TRecord = typename std::remove_pointer<decltype(
// Opens a database accessor bound to transaction `tx_id`.
TransactionUpdates(database::GraphDb *db,
tx::TransactionId tx_id)
: db_accessor_(db->Access(tx_id)) {}
/// Adds a delta and returns the result. Does not modify the state (data)
/// of the graph element the update is for, but calls the `update` method
/// to fail-fast on serialization and update-after-delete errors.
UpdateResult Emplace(const database::StateDelta &delta, int worker_id);
/// Creates a new vertex and returns its cypher_id and gid.
CreatedInfo CreateVertex(
const std::vector<storage::Label> &labels,
const std::unordered_map<storage::Property, PropertyValue> &properties,
std::optional<int64_t> cypher_id = std::nullopt);
/// Creates a new edge and returns its cypher_id and gid. Does not update
/// vertices at the end of the edge.
CreatedInfo CreateEdge(gid::Gid from, storage::VertexAddress to,
storage::EdgeType edge_type, int worker_id,
std::optional<int64_t> cypher_id = std::nullopt);
/// Applies all the deltas on the record.
UpdateResult Apply();
/// Applies all deltas made by certain worker to given old and new record.
/// This method could change newr pointer, and if it does it won't free that
/// memory. In case that update method needs to be called on records, new
/// record will be created by calling CloneData on old record. Caller
/// has to make sure to free that memory.
void ApplyDeltasToRecord(gid::Gid gid, int worker_id, TRecord **old,
TRecord **newr);
auto &db_accessor() { return *db_accessor_; }
// Accessor owned by this object; created in the constructor.
std::unique_ptr<database::GraphDbAccessor> db_accessor_;
// Tail of the declaration of the container holding, per record, its accessor
// and buffered deltas (the start of the declaration is not visible here).
std::pair<TRecordAccessor, std::vector<DeltaPair>>>
// Multiple workers might be sending remote updates concurrently.
utils::SpinLock lock_;
// Helper method specialized for [Vertex|Edge]Accessor.
TRecordAccessor FindAccessor(gid::Gid gid);
// Registers the update RPC handlers on `coordination`.
UpdatesRpcServer(database::GraphDb *db,
distributed::Coordination *coordination);
/// Applies all existing updates for the given transaction ID. If there are
/// no updates for that transaction, nothing happens. Clears the updates
/// cache after applying them, regardless of the result.
UpdateResult Apply(tx::TransactionId tx_id);
/// Applies all deltas made by certain worker to given old and new record.
/// This method could change newr pointer, and if it does it won't free that
/// memory. In case that update method needs to be called on records, new
/// record will be created by calling CloneData on old record. Caller
/// has to make sure to free that memory.
template <typename TRecord>
void ApplyDeltasToRecord(tx::TransactionId tx_id, gid::Gid, int worker_id,
TRecord **old, TRecord **newr);
/// Clears the cache of local transactions that are completed. The signature
/// of this method is dictated by `distributed::TransactionalCacheCleaner`.
void ClearTransactionalCache(tx::TransactionId oldest_active);
database::GraphDb *db_;
// Per-transaction update buffers, keyed by transaction id.
template <typename TAccessor>
using MapT = ConcurrentMap<tx::TransactionId, TransactionUpdates<TAccessor>>;
MapT<VertexAccessor> vertex_updates_;
MapT<EdgeAccessor> edge_updates_;
// Gets/creates the TransactionUpdates for the given transaction.
template <typename TAccessor>
TransactionUpdates<TAccessor> &GetUpdates(MapT<TAccessor> &updates,
tx::TransactionId tx_id);
// Performs edge creation for the given request.
CreateResult CreateEdge(const CreateEdgeReqData &req);
// Performs edge removal for the given request.
UpdateResult RemoveEdge(const RemoveEdgeData &data);
} // namespace distributed

View File

@ -1,92 +0,0 @@
#include "durability/distributed/paths.hpp"
#include <filesystem>
#include <optional>
#include <string>
#include "glog/logging.h"
#include "transactions/type.hpp"
#include "utils/string.hpp"
#include "utils/timestamp.hpp"
namespace durability {
namespace fs = std::filesystem;
std::optional<tx::TransactionId> TransactionIdFromWalFilename(
const std::string &name) {
auto nullopt = std::nullopt;
// Get the max_transaction_id from the file name that has format
// "XXXXX__max_transaction_<MAX_TRANS_ID>_worker_<Worker_ID>"
auto file_name_split = utils::RSplit(name, "__", 1);
if (file_name_split.size() != 2) {
LOG(WARNING) << "Unable to parse WAL file name: " << name;
return nullopt;
if (utils::StartsWith(file_name_split[1], "current"))
return std::numeric_limits<tx::TransactionId>::max();
file_name_split = utils::Split(file_name_split[1], "_");
if (file_name_split.size() != 5) {
LOG(WARNING) << "Unable to parse WAL file name: " << name;
return nullopt;
auto &tx_id_str = file_name_split[2];
try {
return std::stoll(tx_id_str);
} catch (std::invalid_argument &) {
LOG(WARNING) << "Unable to parse WAL file name tx ID: " << tx_id_str;
return nullopt;
} catch (std::out_of_range &) {
LOG(WARNING) << "WAL file name tx ID too large: " << tx_id_str;
return nullopt;
// Builds the path of a snapshot file inside `durability_dir / kSnapshotDir`.
// The file name is "<date_str>_worker_<worker_id>_tx_" followed by a suffix.
// NOTE(review): the initializer of `date_str` and the end of the `file_name`
// expression are not visible in this excerpt (presumably a timestamp and the
// transaction id) -- confirm against the full file.
fs::path MakeSnapshotPath(const fs::path &durability_dir, const int worker_id,
tx::TransactionId tx_id) {
std::string date_str =
auto file_name = date_str + "_worker_" + std::to_string(worker_id) + "_tx_" +
return durability_dir / kSnapshotDir / file_name;
/// Builds the path of a write-ahead log file inside `wal_dir`.
///
/// The file name starts with the current ISO 8601 timestamp. When `tx_id` is
/// set it is followed by "__max_transaction_<tx_id>", otherwise by
/// "__current" (the max tx id of the current WAL file is still unknown). The
/// name always ends with "_Worker_<worker_id>".
fs::path WalFilenameForTransactionId(const std::filesystem::path &wal_dir,
                                     int worker_id,
                                     std::optional<tx::TransactionId> tx_id) {
  auto file_name = utils::Timestamp::Now().ToIso8601();
  file_name += tx_id ? "__max_transaction_" + std::to_string(*tx_id)
                     : std::string("__current");
  file_name += "_Worker_" + std::to_string(worker_id);
  return wal_dir / file_name;
}
std::optional<tx::TransactionId> TransactionIdFromSnapshotFilename(
const std::string &name) {
auto nullopt = std::nullopt;
auto file_name_split = utils::RSplit(name, "_tx_", 1);
if (file_name_split.size() != 2) {
LOG(WARNING) << "Unable to parse snapshot file name: " << name;
return nullopt;
try {
return std::stoll(file_name_split[1]);
} catch (std::invalid_argument &) {
LOG(WARNING) << "Unable to parse snapshot file name tx ID: "
<< file_name_split[1];
return nullopt;
} catch (std::out_of_range &) {
LOG(WARNING) << "Unable to parse snapshot file name tx ID: "
<< file_name_split[1];
return nullopt;
} // namespace durability

View File

@ -1,40 +0,0 @@
#pragma once
#include <filesystem>
#include <optional>
#include "transactions/type.hpp"
namespace durability {
// Names of the sub-directories used inside the durability directory.
const std::string kSnapshotDir = "snapshots";
const std::string kWalDir = "wal";
const std::string kBackupDir = ".backup";
/// Returns the transaction id contained in the file name. If the filename is
/// not a parseable WAL file name, nullopt is returned. If the filename
/// represents the "current" WAL file, then the maximum possible transaction ID
/// is returned because that's appropriate for the recovery logic (the current
/// WAL does not yet have a maximum transaction ID and can't be discarded by
/// the recovery regardless of the snapshot from which the transaction starts).
std::optional<tx::TransactionId> TransactionIdFromWalFilename(
const std::string &name);
/** Generates a path for a DB snapshot in the given folder in a well-defined
* sortable format with worker id and transaction from which the snapshot is
* created appended to the file name. */
std::filesystem::path MakeSnapshotPath(
const std::filesystem::path &durability_dir, int worker_id,
tx::TransactionId tx_id);
/// Returns the transaction id contained in the file name. If the filename is
/// not a parseable snapshot file name, nullopt is returned.
std::optional<tx::TransactionId> TransactionIdFromSnapshotFilename(
const std::string &name);
/// Generates a file path for a write-ahead log file of a specified worker. If
/// given a transaction ID the file name will contain it. Otherwise the file
/// path is for the "current" WAL file for which the max tx id is still unknown.
std::filesystem::path WalFilenameForTransactionId(
const std::filesystem::path &wal_dir, int worker_id,
std::optional<tx::TransactionId> tx_id = std::nullopt);
} // namespace durability

View File

@ -1,502 +0,0 @@
#include "durability/distributed/recovery.hpp"
#include <filesystem>
#include <limits>
#include <unordered_map>
#include "database/distributed/graph_db_accessor.hpp"
#include "durability/distributed/paths.hpp"
#include "durability/distributed/snapshot_decoder.hpp"
#include "durability/distributed/snapshot_value.hpp"
#include "durability/distributed/version.hpp"
#include "durability/distributed/wal.hpp"
#include "durability/hashed_file_reader.hpp"
#include "glue/communication.hpp"
#include "storage/distributed/address_types.hpp"
#include "storage/distributed/indexes/label_property_index.hpp"
#include "transactions/type.hpp"
#include "utils/algorithm.hpp"
#include "utils/file.hpp"
namespace fs = std::filesystem;
namespace durability {
using communication::bolt::Value;
// Reads the vertex count, edge count and hash stored in the trailing bytes of
// a snapshot file into the output parameters. Returns true only if all three
// reads succeed.
// NOTE(review): `pos` is captured but no seek back to it is visible in this
// excerpt -- confirm the read position is restored in the full file.
bool ReadSnapshotSummary(HashedFileReader &buffer, int64_t &vertex_count,
int64_t &edge_count, uint64_t &hash) {
auto pos = buffer.Tellg();
// The summary occupies exactly these three fields at the end of the file.
auto offset = sizeof(vertex_count) + sizeof(edge_count) + sizeof(hash);
// `offset` is unsigned, so `-offset` relies on the conversion of the wrapped
// value to the signed offset type expected by Seek.
buffer.Seek(-offset, std::ios_base::end);
// The `false` argument presumably keeps these reads out of the running hash
// (the header describes this as "without messing up the hash") -- confirm.
bool r_val = buffer.ReadType(vertex_count, false) &&
buffer.ReadType(edge_count, false) &&
buffer.ReadType(hash, false);
return r_val;
// Scans the snapshot and WAL directories under `durability_dir` and returns
// false as soon as a file with the expected magic number carries a version
// different from durability::kVersion. Files that cannot be opened, are too
// short or have a different magic number are skipped.
// NOTE(review): the first argument of `reader.Read(` is missing from this
// excerpt (presumably the magic-number buffer).
bool VersionConsistency(const fs::path &durability_dir) {
for (const auto &durability_type : {kSnapshotDir, kWalDir}) {
auto recovery_dir = durability_dir / durability_type;
if (!fs::exists(recovery_dir) || !fs::is_directory(recovery_dir)) continue;
for (const auto &file : fs::directory_iterator(recovery_dir)) {
HashedFileReader reader;
SnapshotDecoder<HashedFileReader> decoder(reader);
// The following checks are ok because we are only trying to detect
// version inconsistencies.
if (!reader.Open(fs::path(file))) continue;
// Snapshots and WAL files use different magic numbers.
std::array<uint8_t, 4> target_magic_number =
(durability_type == kSnapshotDir) ? durability::kSnapshotMagic
: durability::kWalMagic;
std::array<uint8_t, 4> magic_number;
if (!reader.Read(, magic_number.size())) continue;
if (magic_number != target_magic_number) continue;
if (reader.EndOfFile()) continue;
// The version is the first value after the magic number.
Value dv;
if (!decoder.ReadValue(&dv, Value::Type::Int) ||
dv.ValueInt() != durability::kVersion)
return false;
return true;
/// Returns true when `master_version` equals this binary's durability version
/// (durability::kVersion).
bool DistributedVersionConsistency(const int64_t master_version) {
  const bool versions_match = (master_version == durability::kVersion);
  return versions_match;
}
// Returns true if the snapshot or WAL directory under `durability_dir` exists,
// is a directory and satisfies one further condition; false otherwise.
// NOTE(review): the last operand of the `if` condition is not visible in this
// excerpt (presumably a non-empty check) -- confirm against the full file.
bool ContainsDurabilityFiles(const fs::path &durability_dir) {
for (const auto &durability_type : {kSnapshotDir, kWalDir}) {
auto recovery_dir = durability_dir / durability_type;
if (fs::exists(recovery_dir) && fs::is_directory(recovery_dir) &&
return true;
return false;
/// Moves every entry of the snapshot and WAL directories under
/// `durability_dir` into the matching sub-directory of
/// `durability_dir / kBackupDir`, creating the backup directories first.
/// A source directory that does not exist (or is not a directory) is skipped.
void MoveToBackup(const fs::path &durability_dir) {
  const auto backup_root = durability_dir / kBackupDir;
  utils::EnsureDirOrDie(backup_root / kSnapshotDir);
  utils::EnsureDirOrDie(backup_root / kWalDir);

  for (const auto &subdir : {kSnapshotDir, kWalDir}) {
    const auto source_dir = durability_dir / subdir;
    const bool usable = fs::exists(source_dir) && fs::is_directory(source_dir);
    if (!usable) continue;

    for (const auto &entry : fs::directory_iterator(source_dir)) {
      const auto base_name = fs::path(entry).filename();
      fs::rename(entry, backup_root / subdir / base_name);
    }
  }
}
namespace {
using communication::bolt::Value;
// Helper for RecoverSnapshot: when `condition` is false, closes the local
// `reader` and makes the enclosing function return false.
// NOTE(review): the macro's closing line is not visible in this excerpt, so
// the trailing backslash below currently continues into the next line.
#define RETURN_IF_NOT(condition) \
if (!(condition)) { \
reader.Close(); \
return false; \
// Recovers the graph stored in `snapshot_file` into `db` and records the
// snapshooter transaction id in `recovery_data`. Returns false when a
// validation step fails (magic number, durability version, worker id, value
// types, final hash) and true otherwise.
// NOTE(review): many statements of this function are not visible in this
// excerpt (opening the file, several RETURN_IF_NOT( openers, loop bodies,
// closing braces); treat the comments below as a guide to the visible lines
// only and confirm against the full file.
bool RecoverSnapshot(const fs::path &snapshot_file, database::GraphDb *db,
RecoveryData *recovery_data, int worker_id) {
HashedFileReader reader;
SnapshotDecoder<HashedFileReader> decoder(reader);
auto magic_number = durability::kSnapshotMagic;
reader.Read(, magic_number.size());
RETURN_IF_NOT(magic_number == durability::kSnapshotMagic);
// Read the vertex and edge count, and the hash, from the end of the snapshot.
int64_t vertex_count;
int64_t edge_count;
uint64_t hash;
durability::ReadSnapshotSummary(reader, vertex_count, edge_count, hash));
Value dv;
RETURN_IF_NOT(decoder.ReadValue(&dv, Value::Type::Int) &&
dv.ValueInt() == durability::kVersion);
// Checks worker id was set correctly
RETURN_IF_NOT(decoder.ReadValue(&dv, Value::Type::Int) &&
dv.ValueInt() == worker_id);
// Vertex and edge generator ids
RETURN_IF_NOT(decoder.ReadValue(&dv, Value::Type::Int));
uint64_t vertex_generator_cnt = dv.ValueInt();
db->storage().VertexGenerator().LocalCount(), vertex_generator_cnt));
RETURN_IF_NOT(decoder.ReadValue(&dv, Value::Type::Int));
uint64_t edge_generator_cnt = dv.ValueInt();
std::max(db->storage().EdgeGenerator().LocalCount(), edge_generator_cnt));
// Id of the transaction that created the snapshot.
RETURN_IF_NOT(decoder.ReadValue(&dv, Value::Type::Int));
recovery_data->snapshooter_tx_id = dv.ValueInt();
// Transaction snapshot of the transaction that created the snapshot.
RETURN_IF_NOT(decoder.ReadValue(&dv, Value::Type::List));
for (const auto &value : dv.ValueList()) {
// A list of label+property indexes.
RETURN_IF_NOT(decoder.ReadValue(&dv, Value::Type::List));
auto index_value = dv.ValueList();
// The list is flat: entries come in (label, property) pairs.
for (auto it = index_value.begin(); it != index_value.end();) {
auto label = *it++;
RETURN_IF_NOT(it != index_value.end());
auto property = *it++;
RETURN_IF_NOT(label.IsString() && property.IsString());
auto dba = db->Access();
// Tail of the declaration of edge_gid_endpoints_mapping, which maps an edge
// gid to its (from, to) vertex addresses.
std::pair<storage::VertexAddress, storage::VertexAddress>>
for (int64_t i = 0; i < vertex_count; ++i) {
auto vertex = decoder.ReadSnapshotVertex();
auto vertex_accessor = dba->InsertVertex(vertex->gid, vertex->cypher_id);
for (const auto &label : vertex->labels) {
for (const auto &property_pair : vertex->properties) {
auto vertex_record = vertex_accessor.GetNew();
// For in-edges this vertex is the `to` endpoint.
for (const auto &edge : vertex->in) {
vertex_record->in_.emplace(edge.vertex, edge.address,
edge_gid_endpoints_mapping[edge.address.gid()] = {
edge.vertex, vertex_accessor.GlobalAddress()};
// For out-edges this vertex is the `from` endpoint.
for (const auto &edge : vertex->out) {
vertex_record->out_.emplace(edge.vertex, edge.address,
edge_gid_endpoints_mapping[edge.address.gid()] = {
vertex_accessor.GlobalAddress(), edge.vertex};
auto vertex_transform_to_local_if_possible =
[&dba, worker_id](storage::VertexAddress &address) {
if (address.is_local()) return;
// If the worker id matches it should be a local appearance
if (address.worker_id() == worker_id) {
address = storage::VertexAddress(
CHECK(address.is_local()) << "Address should be local but isn't";
auto edge_transform_to_local_if_possible =
[&dba, worker_id](storage::EdgeAddress &address) {
if (address.is_local()) return;
// If the worker id matches it should be a local appearance
if (address.worker_id() == worker_id) {
address = storage::EdgeAddress(
CHECK(address.is_local()) << "Address should be local but isn't";
Value dv_cypher_id;
for (int64_t i = 0; i < edge_count; ++i) {
decoder.ReadValue(&dv, communication::bolt::Value::Type::Edge));
auto &edge = dv.ValueEdge();
// Read cypher_id
auto cypher_id = dv_cypher_id.ValueInt();
// We have to take full edge endpoints from vertices since the endpoints
// found here don't contain worker_id, and this can't be changed since these
// edges must be bolt-compliant
auto &edge_endpoints = edge_gid_endpoints_mapping[];
storage::VertexAddress from;
storage::VertexAddress to;
std::tie(from, to) = edge_endpoints;
// From and to are written in the global_address format and we should
// convert them back to local format for speedup - if possible
auto edge_accessor = dba->InsertOnlyEdge(from, to, dba->EdgeType(edge.type),, cypher_id);
for (const auto &property_pair :
// Vertex and edge counts are included in the hash. Re-read them to update the
// hash.
if (!reader.Close() || reader.hash() != hash) {
return false;
// We have to replace global_ids with local ids where possible for all edges
// in every vertex and this can only be done after we inserted the edges; this
// is to speedup execution
for (auto &vertex_accessor : dba->Vertices(true)) {
auto vertex = vertex_accessor.GetNew();
auto iterate_and_transform =
edge_transform_to_local_if_possible](Edges &edges) {
Edges transformed;
for (auto &element : edges) {
auto vertex = element.vertex;
auto edge = element.edge;
transformed.emplace(vertex, edge, element.edge_type);
return transformed;
vertex->in_ = iterate_and_transform(vertex->in_);
vertex->out_ = iterate_and_transform(vertex->out_);
// Ensure that the next transaction ID in the recovered DB will be greater
// than the latest one we have recovered. Do this to make sure that
// subsequently created snapshots and WAL files will have transactional info
// that does not interfere with that found in previous snapshots and WAL.
tx::TransactionId max_id = recovery_data->snapshooter_tx_id;
auto &snap = recovery_data->snapshooter_tx_snapshot;
if (!snap.empty()) {
max_id = std::max(max_id, *std::max_element(snap.begin(), snap.end()));
return true;
/// Returns the paths of all entries in `wal_dir`, sorted ascending.
///
/// WAL file names start with a timestamp, so the lexicographic order of the
/// paths is also their date order.
///
/// @param wal_dir directory holding the WAL files.
/// @return sorted entry paths; empty when `wal_dir` does not exist.
std::vector<std::filesystem::path> GetWalFiles(
    const std::filesystem::path &wal_dir) {
  std::vector<std::filesystem::path> wal_files;
  if (!std::filesystem::exists(wal_dir)) return wal_files;

  // Collect every directory entry first ...
  for (const auto &wal_file : std::filesystem::directory_iterator(wal_dir)) {
    wal_files.emplace_back(wal_file.path());
  }
  // ... and sort once, after the whole directory has been read.
  std::sort(wal_files.begin(), wal_files.end());
  return wal_files;
}
// Iterates over `wal_files`, skipping files whose name cannot be parsed or
// whose max transaction id is below `first_to_recover`, and decodes the
// deltas of the remaining files. Returns false when a file cannot be opened
// or its magic number / version do not match; true otherwise.
// NOTE(review): the first argument of `wal_reader.Read(` and the call that
// hands each decoded delta to `f` are not visible in this excerpt.
bool ApplyOverDeltas(
const std::vector<fs::path> &wal_files, tx::TransactionId first_to_recover,
const std::function<void(const database::StateDelta &)> &f) {
for (auto &wal_file : wal_files) {
auto wal_file_max_tx_id = TransactionIdFromWalFilename(wal_file.filename());
if (!wal_file_max_tx_id || *wal_file_max_tx_id < first_to_recover) continue;
HashedFileReader wal_reader;
if (!wal_reader.Open(wal_file)) return false;
communication::bolt::Decoder<HashedFileReader> decoder(wal_reader);
// Header check: magic number followed by the durability version.
auto magic_number = durability::kWalMagic;
wal_reader.Read(, magic_number.size());
if (magic_number != durability::kWalMagic) return false;
Value dv;
if (!decoder.ReadValue(&dv, Value::Type::Int) ||
dv.ValueInt() != durability::kVersion)
return false;
// Decode deltas until Decode yields an empty result.
while (true) {
auto delta = database::StateDelta::Decode(wal_reader, decoder);
if (!delta) break;
return true;
/// Returns the id of the first transaction that WAL recovery has to consider.
///
/// When the snapshooter's transaction snapshot is empty, that is the
/// transaction right after the snapshooter. Otherwise it is the smallest id
/// in the snapshot.
auto FirstWalTxToRecover(const RecoveryData &recovery_data) {
  const auto &tx_sn = recovery_data.snapshooter_tx_snapshot;
  // std::min_element (not std::min): std::min on the two iterators would just
  // pick begin() and yield the first element instead of the smallest one.
  auto first_to_recover =
      tx_sn.empty() ? recovery_data.snapshooter_tx_id + 1
                    : *std::min_element(tx_sn.begin(), tx_sn.end());
  return first_to_recover;
}
/// Collects the ids of transactions that have a TRANSACTION_COMMIT delta in
/// the WAL files under `wal_dir` and are not older than the first transaction
/// to recover.
///
/// @param wal_dir directory holding the WAL files.
/// @param db unused by this function.
/// @param recovery_data snapshot info used to compute the first transaction
///        to recover.
/// @return the committed transaction ids, in unspecified order.
std::vector<tx::TransactionId> ReadWalRecoverableTransactions(
    const fs::path &wal_dir, database::GraphDb *db,
    const RecoveryData &recovery_data) {
  auto wal_files = GetWalFiles(wal_dir);

  std::unordered_set<tx::TransactionId> committed_set;

  auto first_to_recover = FirstWalTxToRecover(recovery_data);
  // NOTE(review): the call consuming this lambda and the statement filling
  // `committed_set` were not present in the reviewed text; they are restored
  // here from ApplyOverDeltas' parameter list -- confirm against history.
  ApplyOverDeltas(
      wal_files, first_to_recover, [&](const database::StateDelta &delta) {
        if (delta.transaction_id >= first_to_recover &&
            delta.type == database::StateDelta::Type::TRANSACTION_COMMIT) {
          committed_set.insert(delta.transaction_id);
        }
      });

  // Range construction: sizing the vector up front and then calling
  // push_back would prepend committed_set.size() zero ids to the result.
  return std::vector<tx::TransactionId>(committed_set.begin(),
                                        committed_set.end());
}
} // anonymous namespace
// Tries to recover `db` from the snapshot files under `durability_dir`,
// newest first, and returns the recovery info (durability version, recovered
// snapshot tx id and the WAL transactions that can be recovered).
// When `required_snapshot_tx_id` is set, snapshot files with a different tx id
// are skipped, and if the required snapshot was not recovered the returned
// info has an empty WAL transaction list.
// NOTE(review): several statements are not visible in this excerpt (collecting
// `snapshot_files`, the initializer of `snapshot_file_tx_id`, `continue`/
// `break` statements and the end of the final return).
RecoveryInfo RecoverOnlySnapshot(
const fs::path &durability_dir, database::GraphDb *db,
RecoveryData *recovery_data,
std::optional<tx::TransactionId> required_snapshot_tx_id, int worker_id) {
// Attempt to recover from snapshot files in reverse order (from newest
// backwards).
const auto snapshot_dir = durability_dir / kSnapshotDir;
std::vector<fs::path> snapshot_files;
if (fs::exists(snapshot_dir) && fs::is_directory(snapshot_dir))
for (auto &file : fs::directory_iterator(snapshot_dir))
// Sorting through reverse iterators yields descending order.
std::sort(snapshot_files.rbegin(), snapshot_files.rend());
for (auto &snapshot_file : snapshot_files) {
if (required_snapshot_tx_id) {
auto snapshot_file_tx_id =
if (!snapshot_file_tx_id ||
snapshot_file_tx_id.value() != *required_snapshot_tx_id) {
LOG(INFO) << "Skipping snapshot file '" << snapshot_file
<< "' because it does not match the required snapshot tx id: "
<< *required_snapshot_tx_id;
LOG(INFO) << "Starting snapshot recovery from: " << snapshot_file;
if (!RecoverSnapshot(snapshot_file, db, recovery_data, worker_id)) {
LOG(WARNING) << "Snapshot recovery failed, trying older snapshot...";
} else {
LOG(INFO) << "Snapshot recovery successful.";
// If snapshot recovery is required, and we failed, don't even deal with
// the WAL recovery.
if (required_snapshot_tx_id &&
recovery_data->snapshooter_tx_id != *required_snapshot_tx_id)
return {durability::kVersion, recovery_data->snapshooter_tx_id, {}};
return {durability::kVersion, recovery_data->snapshooter_tx_id,
ReadWalRecoverableTransactions(durability_dir / kWalDir, db,
// TODO - finer-grained recovery feedback could be useful here.
// Replays the WAL files under `durability_dir`, skipping deltas of
// transactions that are older than the first one to recover, that were
// already covered by the snapshot, or that are not listed in
// `recovery_data->wal_tx_to_recover`.
// NOTE(review): several statements are not visible in this excerpt (filling
// `common_wal_tx`, the ApplyOverDeltas( opener, the bodies of the switch
// cases and the use of `max_observed_tx_id`); confirm against the full file.
void RecoverWal(const fs::path &durability_dir, database::GraphDb *db,
RecoveryData *recovery_data,
RecoveryTransactions *transactions) {
auto wal_dir = durability_dir / kWalDir;
auto wal_files = GetWalFiles(wal_dir);
// Track which transaction should be recovered first, and define logic for
// which transactions should be skipped in recovery.
auto &tx_sn = recovery_data->snapshooter_tx_snapshot;
auto first_to_recover = FirstWalTxToRecover(*recovery_data);
// Set of transactions which can be recovered, since not every transaction in
// wal can be recovered because it might not be present on some workers (there
// wasn't enough time for it to flush to disk or similar)
std::unordered_set<tx::TransactionId> common_wal_tx;
for (auto tx_id : recovery_data->wal_tx_to_recover)
auto should_skip = [&tx_sn, recovery_data, &common_wal_tx,
first_to_recover](tx::TransactionId tx_id) {
return tx_id < first_to_recover ||
(tx_id < recovery_data->snapshooter_tx_id &&
!utils::Contains(tx_sn, tx_id)) ||
!utils::Contains(common_wal_tx, tx_id);
// Ensure that the next transaction ID in the recovered DB will be greater
// than the latest one we have recovered. Do this to make sure that
// subsequently created snapshots and WAL files will have transactional info
// that does not interfere with that found in previous snapshots and WAL.
tx::TransactionId max_observed_tx_id{0};
// Read all the WAL files whose max_tx_id is not smaller than
// first_to_recover.
wal_files, first_to_recover, [&](const database::StateDelta &delta) {
// Track the largest id seen, even for deltas that end up being skipped.
max_observed_tx_id = std::max(max_observed_tx_id, delta.transaction_id);
if (should_skip(delta.transaction_id)) return;
switch (delta.type) {
case database::StateDelta::Type::TRANSACTION_BEGIN:
case database::StateDelta::Type::TRANSACTION_ABORT:
case database::StateDelta::Type::TRANSACTION_COMMIT:
case database::StateDelta::Type::BUILD_INDEX:
// TODO index building might still be problematic in HA
// TODO when implementing proper error handling return one of the following:
// - WAL fully recovered
// - WAL partially recovered
// - WAL recovery error
// Iterates over the (label, property) name pairs in `indexes` using a fresh
// database accessor and builds a LabelPropertyIndex key for each pair.
// NOTE(review): the key's initializer and whatever is done with the key are
// not visible in this excerpt -- confirm against the full file.
void RecoverIndexes(
database::GraphDb *db,
const std::vector<std::pair<std::string, std::string>> &indexes) {
auto db_accessor_indices = db->Access();
for (const auto &label_prop : indexes) {
const database::LabelPropertyIndex::Key key{
} // namespace durability

View File

@ -1,132 +0,0 @@
#pragma once
#include <filesystem>
#include <optional>
#include <unordered_map>
#include <vector>
#include "durability/distributed/state_delta.hpp"
#include "durability/hashed_file_reader.hpp"
#include "transactions/type.hpp"
namespace database {
class GraphDb;
namespace durability {
/// Stores info on what was (or needs to be) recovered from durability.
struct RecoveryInfo {
RecoveryInfo() {}
RecoveryInfo(const int64_t durability_version,
tx::TransactionId snapshot_tx_id,
const std::vector<tx::TransactionId> &wal_recovered)
: durability_version(durability_version),
wal_recovered(wal_recovered) {}
int64_t durability_version;
tx::TransactionId snapshot_tx_id;
std::vector<tx::TransactionId> wal_recovered;
bool operator==(const RecoveryInfo &other) const {
return durability_version == other.durability_version &&
snapshot_tx_id == other.snapshot_tx_id &&
wal_recovered == other.wal_recovered;
bool operator!=(const RecoveryInfo &other) const { return !(*this == other); }
// A data structure for exchanging info between main recovery function and
// snapshot and WAL recovery functions.
struct RecoveryData {
// Id of the transaction that created the recovered snapshot.
tx::TransactionId snapshooter_tx_id{0};
// Transactions that WAL recovery is allowed to replay.
std::vector<tx::TransactionId> wal_tx_to_recover{};
// Transaction snapshot of the transaction that created the snapshot.
std::vector<tx::TransactionId> snapshooter_tx_snapshot;
// A collection into which the indexes should be added so they
// can be rebuilt at the end of the recovery transaction.
std::vector<std::pair<std::string, std::string>> indexes;
// Resets the recovery state.
// NOTE(review): only the reset of snapshooter_tx_id is visible in this
// excerpt; confirm whether the vectors are cleared as well.
void Clear() {
snapshooter_tx_id = 0;
/** Reads snapshot metadata from the end of the file without messing up the
* hash. */
bool ReadSnapshotSummary(HashedFileReader &buffer, int64_t &vertex_count,
int64_t &edge_count, uint64_t &hash);
/**
 * Checks version consistency within the durability directory.
 * @param durability_dir - Path to durability directory.
 * @return - True if snapshot and WAL versions are compatible with
 *           the current memgraph binary.
 */
bool VersionConsistency(const std::filesystem::path &durability_dir);
/**
 * Checks whether the current memgraph binary (on a worker) is
 * version consistent with the cluster master.
 * @param master_version - Version of the master.
 * @return - True if versions match.
 */
bool DistributedVersionConsistency(const int64_t master_version);
/**
 * Checks whether the durability directory contains a snapshot
 * or write-ahead log file.
 * @param durability_dir - Path to durability directory.
 * @return - True if durability directory contains either a snapshot
 *           or WAL file.
 */
bool ContainsDurabilityFiles(const std::filesystem::path &durabilty_dir);
/**
 * Backs up snapshots and WAL files to a backup folder.
 * @param durability_dir - Path to durability directory.
 */
void MoveToBackup(const std::filesystem::path &durability_dir);
/**
 * Recovers database from the latest possible snapshot. If recovering fails,
 * false is returned and db_accessor aborts transaction, else true is returned
 * and transaction is committed.
 * @param durability_dir - Path to durability directory.
 * @param db - The database to recover into.
 * @param required_snapshot_tx_id - Only used on distributed worker. Indicates
 * what the master recovered. The same snapshot must be recovered on the
 * worker.
 * @return - recovery info
 */
RecoveryInfo RecoverOnlySnapshot(
const std::filesystem::path &durability_dir, database::GraphDb *db,
durability::RecoveryData *recovery_data,
std::optional<tx::TransactionId> required_snapshot_tx_id, int worker_id);
/** Interface for accessing transactions during WAL recovery. */
// NOTE(review): no access specifier and no closing brace are visible for this
// class in this excerpt.
class RecoveryTransactions {
// Virtual destructor with an empty body.
virtual ~RecoveryTransactions() {}
// Pure virtual hooks, each taking a tx::TransactionId by const reference.
virtual void Begin(const tx::TransactionId &) = 0;
virtual void Abort(const tx::TransactionId &) = 0;
virtual void Commit(const tx::TransactionId &) = 0;
// Pure virtual hook taking a single database::StateDelta.
virtual void Apply(const database::StateDelta &) = 0;
// Declaration of WAL recovery. Takes the durability directory, the target
// database, the shared RecoveryData and a RecoveryTransactions implementation.
// NOTE(review): presumably the WAL deltas are replayed through `transactions`;
// the definition is not part of this excerpt -- confirm there.
void RecoverWal(const std::filesystem::path &durability_dir,
database::GraphDb *db, RecoveryData *recovery_data,
RecoveryTransactions *transactions);
// Declaration of index recovery for `db`. `indexes` is a list of string pairs,
// the same shape as RecoveryData::indexes; presumably (label, property) names
// -- confirm against the definition, which is not part of this excerpt.
void RecoverIndexes(
database::GraphDb *db,
const std::vector<std::pair<std::string, std::string>> &indexes);
} // namespace durability

View File

@ -1,34 +0,0 @@
#pragma once
#include "durability/distributed/recovery.hpp"
#include "slk/serialization.hpp"
// SLK Save/Load overloads for durability::RecoveryInfo and
// durability::RecoveryData. Each Load reads the members in exactly the order
// the matching Save writes them.
// NOTE(review): the closing braces of the four functions are not visible in
// this excerpt.
namespace slk {
// Saves RecoveryInfo member by member: durability_version, snapshot_tx_id,
// wal_recovered.
inline void Save(const durability::RecoveryInfo &info, slk::Builder *builder) {
slk::Save(info.durability_version, builder);
slk::Save(info.snapshot_tx_id, builder);
slk::Save(info.wal_recovered, builder);
// Loads RecoveryInfo into `*info`, in the same member order as Save above.
inline void Load(durability::RecoveryInfo *info, slk::Reader *reader) {
slk::Load(&info->durability_version, reader);
slk::Load(&info->snapshot_tx_id, reader);
slk::Load(&info->wal_recovered, reader);
// Saves RecoveryData member by member: snapshooter_tx_id, wal_tx_to_recover,
// snapshooter_tx_snapshot, indexes.
inline void Save(const durability::RecoveryData &data, slk::Builder *builder) {
slk::Save(data.snapshooter_tx_id, builder);
slk::Save(data.wal_tx_to_recover, builder);
slk::Save(data.snapshooter_tx_snapshot, builder);
slk::Save(data.indexes, builder);
// Loads RecoveryData into `*data`, in the same member order as Save above.
inline void Load(durability::RecoveryData *data, slk::Reader *reader) {
slk::Load(&data->snapshooter_tx_id, reader);
slk::Load(&data->wal_tx_to_recover, reader);
slk::Load(&data->snapshooter_tx_snapshot, reader);
slk::Load(&data->indexes, reader);
} // namespace slk

View File

@ -1,142 +0,0 @@
#include "durability/distributed/snapshooter.hpp"
#include <algorithm>
#include <glog/logging.h>
#include "database/distributed/graph_db_accessor.hpp"
#include "durability/distributed/paths.hpp"
#include "durability/distributed/snapshot_encoder.hpp"
#include "durability/distributed/version.hpp"
#include "durability/hashed_file_writer.hpp"
#include "utils/file.hpp"
namespace fs = std::filesystem;
namespace durability {
// Snapshot layout is described in durability/version.hpp
static_assert(durability::kVersion == 6,
"Wrong snapshot version, please update!");
namespace {
// Encodes a snapshot of the database, read through `dba`, into
// `snapshot_file` using a SnapshotEncoder on top of a HashedFileWriter.
// When std::ifstream::failure is caught, the snapshot file is removed if it
// exists (a failed removal is logged) and the visible `return false` follows;
// the final statement is `return true`.
// NOTE(review): most statements of this function (the actual encoder writes,
// the loop bodies and the closing braces) are not visible in this excerpt;
// only their comments and loop headers remain.
bool Encode(const fs::path &snapshot_file, database::GraphDb &db,
database::GraphDbAccessor &dba, int worker_id) {
try {
HashedFileWriter buffer(snapshot_file);
SnapshotEncoder<HashedFileWriter> encoder(buffer);
int64_t vertex_num = 0, edge_num = 0;
// Writes the worker id to snapshot, used to guarantee consistent cluster
// state after recovery
// Write the number of generated vertex and edges, used to recover
// generators internal states
// Write the ID of the transaction doing the snapshot.
// Write the transaction snapshot into the snapshot. It's used when
// recovering from the combination of snapshot and write-ahead-log.
std::vector<communication::bolt::Value> tx_snapshot;
for (int64_t tx : dba.transaction().snapshot())
// Write label+property indexes as list ["label", "property", ...]
std::vector<communication::bolt::Value> index_vec;
for (const auto &key : dba.GetIndicesKeys()) {
for (const auto &vertex : dba.Vertices(false)) {
for (const auto &edge : dba.Edges(false)) {
} catch (const std::ifstream::failure &) {
if (fs::exists(snapshot_file) && !fs::remove(snapshot_file)) {
LOG(ERROR) << "Error while removing corrupted snapshot file: "
<< snapshot_file;
return false;
return true;
// Removes snapshot files so that only `max_retained` latest ones are kept. If
// `max_retained == -1`, all the snapshots are retained.
//
// Nothing is removed while the number of collected files is at most
// `max_retained`. Otherwise the paths are sorted ascending and the first
// `files.size() - max_retained` of them are removed; a failed removal is
// logged. "Latest" therefore relies on the file names sorting in creation
// order -- assumption, confirm against MakeSnapshotPath.
void RemoveOldSnapshots(const fs::path &snapshot_dir, int max_retained) {
if (max_retained == -1) return;
std::vector<fs::path> files;
// NOTE(review): the body of this loop (presumably collecting each directory
// entry into `files`) is not visible in this excerpt.
for (auto &file : fs::directory_iterator(snapshot_dir))
if (static_cast<int>(files.size()) <= max_retained) return;
sort(files.begin(), files.end());
for (int i = 0; i < static_cast<int>(files.size()) - max_retained; ++i) {
if (!fs::remove(files[i])) {
LOG(ERROR) << "Error while removing file: " << files[i];
// Removes write-ahead log files that are no longer necessary (they don't get
// used when recovering from the latest snapshot).
// Does nothing when `wal_dir` does not exist.
void RemoveOldWals(const fs::path &wal_dir,
const tx::Transaction &snapshot_transaction) {
if (!fs::exists(wal_dir)) return;
// We can remove all the WAL files that will not be used when restoring from
// the snapshot created in the given transaction.
// The threshold is the first id in the transaction's snapshot, or the
// transaction's own id + 1 when that snapshot is empty.
auto min_trans_id = snapshot_transaction.snapshot().empty()
? snapshot_transaction.id_ + 1
: snapshot_transaction.snapshot().front();
for (auto &wal_file : fs::directory_iterator(wal_dir)) {
// Files for which no transaction id is obtained from the file name are kept.
auto tx_id = TransactionIdFromWalFilename(wal_file.path().filename());
if (tx_id && tx_id.value() < min_trans_id) {
bool result = fs::remove(wal_file);
DCHECK(result) << "Unable to delete old wal file: " << wal_file;
} // namespace
// Creates a snapshot file for `worker_id` and the accessor's transaction
// inside the snapshot sub-directory of `durability_dir`.
// Returns false when the snapshot directory cannot be ensured, when the target
// file already exists, or when Encode fails (in which case the file is removed
// with the non-throwing fs::remove overload). On success old snapshots and WAL
// files are pruned and true is returned.
// NOTE(review): the closing braces are not visible in this excerpt.
bool MakeSnapshot(database::GraphDb &db, database::GraphDbAccessor &dba,
int worker_id, const fs::path &durability_dir,
int snapshot_max_retained) {
if (!utils::EnsureDir(durability_dir / kSnapshotDir)) return false;
const auto snapshot_file =
MakeSnapshotPath(durability_dir, worker_id, dba.transaction_id());
// Never overwrite an existing snapshot file.
if (fs::exists(snapshot_file)) return false;
if (Encode(snapshot_file, db, dba, worker_id)) {
RemoveOldSnapshots(durability_dir / kSnapshotDir, snapshot_max_retained);
RemoveOldWals(durability_dir / kWalDir, dba.transaction());
return true;
} else {
std::error_code error_code; // Just for exception suppression.
fs::remove(snapshot_file, error_code);
return false;
} // namespace durability

View File

@ -1,20 +0,0 @@
#pragma once
#include <filesystem>
#include "database/distributed/graph_db.hpp"
namespace durability {
/**
 * Makes a snapshot and saves it in the snapshots folder. Returns true if
 * successful.
 * @param db - database for which we are creating a snapshot
 * @param dba - db accessor with which we are creating a snapshot (reading data)
 * @param durability_dir - directory where durability data is stored.
 * @param snapshot_max_retained - maximum number of snapshots to retain.
 */
bool MakeSnapshot(database::GraphDb &db, database::GraphDbAccessor &dba,
int worker_id, const std::filesystem::path &durability_dir,
int snapshot_max_retained);
} // namespace durability

View File

@ -1,105 +0,0 @@
#pragma once
#include <optional>
#include "communication/bolt/v1/decoder/decoder.hpp"
#include "durability/distributed/snapshot_value.hpp"
namespace durability {
// Decoder for snapshot contents, built on communication::bolt::Decoder over
// the same Buffer type.
// NOTE(review): no access specifier is visible for this class in this excerpt.
template <typename Buffer>
class SnapshotDecoder : public communication::bolt::Decoder<Buffer> {
// Forwards `buffer` to the bolt decoder base class.
explicit SnapshotDecoder(Buffer &buffer)
: communication::bolt::Decoder<Buffer>(buffer) {}
// Reads one vertex from the snapshot. The visible read order is: a bolt Vertex
// value (global id, labels, properties), the cypher_id (Int), the number of in
// edges followed by that many inlined edges, then the number of out edges
// followed by that many inlined edges. Returns std::nullopt as soon as any
// read fails, otherwise the populated SnapshotVertex.
// NOTE(review): several statements are garbled or missing in this excerpt
// (e.g. `vertex.gid =;`, the trailing `=;` after the labels assignment, the
// `*edge);` fragment, and all closing braces); the code is kept as found.
std::optional<SnapshotVertex> ReadSnapshotVertex() {
communication::bolt::Value dv;
SnapshotVertex vertex;
// Read global id, labels and properties of the vertex
if (!communication::bolt::Decoder<Buffer>::ReadValue(
&dv, communication::bolt::Value::Type::Vertex)) {
DLOG(WARNING) << "Unable to read snapshot vertex";
return std::nullopt;
auto &read_vertex = dv.ValueVertex();
vertex.gid =;
vertex.labels = read_vertex.labels; =;
// Read cypher_id
if (!communication::bolt::Decoder<Buffer>::ReadValue(
&dv, communication::bolt::Value::Type::Int)) {
DLOG(WARNING) << "Unable to read vertex cypher_id";
return std::nullopt;
vertex.cypher_id = dv.ValueInt();
// Read in edges
if (!communication::bolt::Decoder<Buffer>::ReadValue(
&dv, communication::bolt::Value::Type::Int)) {
DLOG(WARNING) << "[ReadSnapshotVertex] Couldn't read number of in "
"edges in vertex!";
return std::nullopt;
// `dv` still holds the edge count here; ReadSnapshotEdge decodes into its own
// local value, so the loop bound is not overwritten.
for (int i = 0; i < dv.ValueInt(); ++i) {
auto edge = ReadSnapshotEdge();
if (!edge) return std::nullopt;*edge);
// Read out edges
if (!communication::bolt::Decoder<Buffer>::ReadValue(
&dv, communication::bolt::Value::Type::Int)) {
DLOG(WARNING) << "[ReadSnapshotVertex] Couldn't read number of out "
"edges in vertex!";
return std::nullopt;
for (int i = 0; i < dv.ValueInt(); ++i) {
auto edge = ReadSnapshotEdge();
if (!edge) return std::nullopt;
VLOG(20) << "[ReadSnapshotVertex] Success";
return vertex;
// Reads one inlined edge: the edge's global id (Int), the global id of the
// vertex on the other side of the edge (Int) and the edge type (String).
// Returns std::nullopt as soon as one of the reads fails, otherwise the
// populated InlinedVertexEdge.
// NOTE(review): the closing braces are not visible in this excerpt.
std::optional<InlinedVertexEdge> ReadSnapshotEdge() {
communication::bolt::Value dv;
InlinedVertexEdge edge;
VLOG(20) << "[ReadSnapshotEdge] Start";
// Read global id of this edge
if (!communication::bolt::Decoder<Buffer>::ReadValue(
&dv, communication::bolt::Value::Type::Int)) {
DLOG(WARNING) << "[ReadSnapshotEdge] Couldn't read Global ID!";
return std::nullopt;
// The decoded integer is cast to uint64_t before building the address.
edge.address = storage::EdgeAddress(static_cast<uint64_t>(dv.ValueInt()));
// Read global vertex id of the other side of the edge
// (global id of from/to vertexes).
if (!communication::bolt::Decoder<Buffer>::ReadValue(
&dv, communication::bolt::Value::Type::Int)) {
DLOG(WARNING) << "[ReadSnapshotEdge] Couldn't read from/to address!";
return std::nullopt;
edge.vertex = storage::VertexAddress(static_cast<uint64_t>(dv.ValueInt()));
// Read edge type
if (!communication::bolt::Decoder<Buffer>::ReadValue(
&dv, communication::bolt::Value::Type::String)) {
DLOG(WARNING) << "[ReadSnapshotEdge] Couldn't read type!";
return std::nullopt;
edge.type = dv.ValueString();
VLOG(20) << "[ReadSnapshotEdge] Success";
return edge;
}; // namespace durability

View File

@ -1,58 +0,0 @@
#pragma once
#include "communication/bolt/v1/encoder/base_encoder.hpp"
#include "database/distributed/graph_db_accessor.hpp"
#include "glue/communication.hpp"
#include "utils/cast.hpp"
namespace durability {
// Encoder for snapshot contents, built on communication::bolt::BaseEncoder
// over the same Buffer type.
// NOTE(review): no access specifier is visible for this class in this excerpt.
template <typename Buffer>
class SnapshotEncoder : public communication::bolt::BaseEncoder<Buffer> {
// Forwards `buffer` to the bolt encoder base class.
explicit SnapshotEncoder(Buffer &buffer)
: communication::bolt::BaseEncoder<Buffer>(buffer) {}
// Writes one vertex to the snapshot. The visible part writes every in edge
// via WriteSnapshotEdge(edge, true) and every out edge via
// WriteSnapshotEdge(edge, false).
// NOTE(review): the statements that write the vertex itself and its cypher_id
// are not visible in this excerpt, the `edges_in` initialiser is garbled
// (`auto edges_in =;`) and the closing braces are missing.
void WriteSnapshotVertex(const VertexAccessor &vertex) {
// Write cypher_id
// Write in edges without properties
auto edges_in =;
for (const auto &edge : edges_in) {
this->WriteSnapshotEdge(edge, true);
// Write out edges without properties
auto edges_out = vertex.out();
for (const auto &edge : edges_out) {
this->WriteSnapshotEdge(edge, false);
// Writes an unsigned 64-bit value.
// NOTE(review): the body and closing brace are not visible in this excerpt, so
// the encoding used cannot be confirmed from here.
void WriteUInt(const uint64_t &value) {
// Writes edge without properties
// According to the remaining comments the edge's global id is written first,
// then the global id of one endpoint (selected by `write_from`), then the
// edge type.
// NOTE(review): the write statements themselves, the else branch and the
// closing brace are not visible in this excerpt.
void WriteSnapshotEdge(const EdgeAccessor &edge, bool write_from) {
// Write global id of the edge
// Write to/from global id
if (write_from)
// Write type
} // namespace durability

View File

@ -1,44 +0,0 @@
#pragma once
#include <map>
#include <string>
#include <vector>
#include "communication/bolt/v1/value.hpp"
#include "storage/common/types/property_value.hpp"
#include "storage/distributed/address_types.hpp"
#include "utils/algorithm.hpp"
#include "utils/exceptions.hpp"
namespace durability {
/** Forward declaration of InlinedVertexEdge (defined further down in this
 * file); needed because SnapshotVertex stores vectors of it. */
struct InlinedVertexEdge;
/**
 * Structure used when reading a Vertex with the decoder.
 * The decoder writes data into this structure.
 */
// Plain data holder for one vertex read from a snapshot.
// NOTE(review): the closing brace is not visible in this excerpt.
struct SnapshotVertex {
// Global id of the vertex.
gid::Gid gid;
// cypher_id of the vertex.
int64_t cypher_id;
// Label names.
std::vector<std::string> labels;
// Property name -> bolt value.
std::map<std::string, communication::bolt::Value> properties;
// Vector of edges without properties
std::vector<InlinedVertexEdge> in;
std::vector<InlinedVertexEdge> out;
/**
 * Structure used when reading an Edge with the snapshot decoder.
 * The decoder writes data into this structure.
 */
// Plain data holder for one edge stored inline with a vertex in a snapshot.
// NOTE(review): the closing brace is not visible in this excerpt.
struct InlinedVertexEdge {
// Addresses down below must always be global_address and never direct
// pointers to a record.
// Address of the edge itself.
storage::EdgeAddress address;
// Address of the vertex on the other side of the edge.
storage::VertexAddress vertex;
// Edge type name.
std::string type;
} // namespace durability

View File

@ -1,411 +0,0 @@
#include "durability/distributed/state_delta.hpp"
#include <string>
#include "communication/bolt/v1/value.hpp"
#include "database/distributed/graph_db_accessor.hpp"
#include "glue/communication.hpp"
namespace database {
// Builds a TRANSACTION_BEGIN delta for transaction `tx_id`.
StateDelta StateDelta::TxBegin(tx::TransactionId tx_id) {
return {StateDelta::Type::TRANSACTION_BEGIN, tx_id};
// Builds a TRANSACTION_COMMIT delta for transaction `tx_id`.
StateDelta StateDelta::TxCommit(tx::TransactionId tx_id) {
return {StateDelta::Type::TRANSACTION_COMMIT, tx_id};
// Builds a TRANSACTION_ABORT delta for transaction `tx_id`.
StateDelta StateDelta::TxAbort(tx::TransactionId tx_id) {
return {StateDelta::Type::TRANSACTION_ABORT, tx_id};
// Builds a CREATE_VERTEX delta carrying the vertex gid and its cypher_id.
StateDelta StateDelta::CreateVertex(tx::TransactionId tx_id, gid::Gid vertex_id,
int64_t cypher_id) {
StateDelta op(StateDelta::Type::CREATE_VERTEX, tx_id);
op.vertex_id = vertex_id;
op.cypher_id = cypher_id;
return op;
// Builds a CREATE_EDGE delta carrying the edge gid, its cypher_id, the gids of
// both endpoint vertices and the edge type, stored both as value and as name.
StateDelta StateDelta::CreateEdge(tx::TransactionId tx_id, gid::Gid edge_id,
int64_t cypher_id, gid::Gid vertex_from_id,
gid::Gid vertex_to_id,
storage::EdgeType edge_type,
const std::string &edge_type_name) {
StateDelta op(StateDelta::Type::CREATE_EDGE, tx_id);
op.edge_id = edge_id;
op.cypher_id = cypher_id;
op.vertex_from_id = vertex_from_id;
op.vertex_to_id = vertex_to_id;
op.edge_type = edge_type;
op.edge_type_name = edge_type_name;
return op;
// Builds an ADD_OUT_EDGE delta for vertex `vertex_id`.
// Both `vertex_to_address` and `edge_address` must be remote (global)
// addresses; the CHECK below enforces this.
StateDelta StateDelta::AddOutEdge(tx::TransactionId tx_id, gid::Gid vertex_id,
storage::VertexAddress vertex_to_address,
storage::EdgeAddress edge_address,
storage::EdgeType edge_type) {
CHECK(vertex_to_address.is_remote() && edge_address.is_remote())
<< "WAL can only contain global addresses.";
StateDelta op(StateDelta::Type::ADD_OUT_EDGE, tx_id);
op.vertex_id = vertex_id;
op.vertex_to_address = vertex_to_address;
op.edge_address = edge_address;
op.edge_type = edge_type;
return op;
// Builds a REMOVE_OUT_EDGE delta for vertex `vertex_id`.
// `edge_address` must be a remote (global) address; the CHECK enforces this.
StateDelta StateDelta::RemoveOutEdge(tx::TransactionId tx_id,
gid::Gid vertex_id,
storage::EdgeAddress edge_address) {
CHECK(edge_address.is_remote()) << "WAL can only contain global addresses.";
StateDelta op(StateDelta::Type::REMOVE_OUT_EDGE, tx_id);
op.vertex_id = vertex_id;
op.edge_address = edge_address;
return op;
// Builds an ADD_IN_EDGE delta for vertex `vertex_id`.
// Both `vertex_from_address` and `edge_address` must be remote (global)
// addresses; the CHECK below enforces this.
StateDelta StateDelta::AddInEdge(tx::TransactionId tx_id, gid::Gid vertex_id,
storage::VertexAddress vertex_from_address,
storage::EdgeAddress edge_address,
storage::EdgeType edge_type) {
CHECK(vertex_from_address.is_remote() && edge_address.is_remote())
<< "WAL can only contain global addresses.";
StateDelta op(StateDelta::Type::ADD_IN_EDGE, tx_id);
op.vertex_id = vertex_id;
op.vertex_from_address = vertex_from_address;
op.edge_address = edge_address;
op.edge_type = edge_type;
return op;
// Builds a REMOVE_IN_EDGE delta for vertex `vertex_id`.
// `edge_address` must be a remote (global) address; the CHECK enforces this.
StateDelta StateDelta::RemoveInEdge(tx::TransactionId tx_id, gid::Gid vertex_id,
storage::EdgeAddress edge_address) {
CHECK(edge_address.is_remote()) << "WAL can only contain global addresses.";
StateDelta op(StateDelta::Type::REMOVE_IN_EDGE, tx_id);
op.vertex_id = vertex_id;
op.edge_address = edge_address;
return op;
// Builds a SET_PROPERTY_VERTEX delta carrying the vertex gid, the property
// name and the new value.
// NOTE(review): the line after the constructor call is garbled in this excerpt
// (`= property;` has lost its left-hand side, presumably `op.property`); the
// code is kept as found.
StateDelta StateDelta::PropsSetVertex(tx::TransactionId tx_id,
gid::Gid vertex_id,
storage::Property property,
const std::string &property_name,
const PropertyValue &value) {
StateDelta op(StateDelta::Type::SET_PROPERTY_VERTEX, tx_id);
op.vertex_id = vertex_id; = property;
op.property_name = property_name;
op.value = value;
return op;
// Builds a SET_PROPERTY_EDGE delta carrying the edge gid, the property name
// and the new value.
// NOTE(review): the line after the constructor call is garbled in this excerpt
// (`= property;` has lost its left-hand side, presumably `op.property`); the
// code is kept as found.
StateDelta StateDelta::PropsSetEdge(tx::TransactionId tx_id, gid::Gid edge_id,
storage::Property property,
const std::string &property_name,
const PropertyValue &value) {
StateDelta op(StateDelta::Type::SET_PROPERTY_EDGE, tx_id);
op.edge_id = edge_id; = property;
op.property_name = property_name;
op.value = value;
return op;
// Builds an ADD_LABEL delta carrying the vertex gid and the label, stored both
// as value and as name.
StateDelta StateDelta::AddLabel(tx::TransactionId tx_id, gid::Gid vertex_id,
storage::Label label,
const std::string &label_name) {
StateDelta op(StateDelta::Type::ADD_LABEL, tx_id);
op.vertex_id = vertex_id;
op.label = label;
op.label_name = label_name;
return op;
// Builds a REMOVE_LABEL delta carrying the vertex gid and the label, stored
// both as value and as name.
StateDelta StateDelta::RemoveLabel(tx::TransactionId tx_id, gid::Gid vertex_id,
storage::Label label,
const std::string &label_name) {
StateDelta op(StateDelta::Type::REMOVE_LABEL, tx_id);
op.vertex_id = vertex_id;
op.label = label;
op.label_name = label_name;
return op;
// Builds a REMOVE_VERTEX delta carrying the vertex gid and the `check_empty`
// flag, which is stored unchanged.
StateDelta StateDelta::RemoveVertex(tx::TransactionId tx_id, gid::Gid vertex_id,
bool check_empty) {
StateDelta op(StateDelta::Type::REMOVE_VERTEX, tx_id);
op.vertex_id = vertex_id;
op.check_empty = check_empty;
return op;
// Builds a REMOVE_EDGE delta carrying the edge gid.
StateDelta StateDelta::RemoveEdge(tx::TransactionId tx_id, gid::Gid edge_id) {
StateDelta op(StateDelta::Type::REMOVE_EDGE, tx_id);
op.edge_id = edge_id;
return op;
// Builds a BUILD_INDEX delta carrying the label (value and name) and the
// property name.
// NOTE(review): the label_name line is garbled in this excerpt (the trailing
// `= property;` has lost its left-hand side, presumably `op.property`); the
// code is kept as found.
StateDelta StateDelta::BuildIndex(tx::TransactionId tx_id, storage::Label label,
const std::string &label_name,
storage::Property property,
const std::string &property_name) {
StateDelta op(StateDelta::Type::BUILD_INDEX, tx_id);
op.label = label;
op.label_name = label_name; = property;
op.property_name = property_name;
return op;
// Encodes this delta through `encoder` / `writer`, dispatching on `type`.
// NOTE(review): only the switch header and some case labels are visible in
// this excerpt; the write statements of every case, several case labels and
// the closing braces are missing, so the exact wire layout cannot be confirmed
// from here (StateDelta::Decode shows the order in which members are read
// back).
void StateDelta::Encode(
HashedFileWriter &writer,
communication::bolt::BaseEncoder<HashedFileWriter> &encoder) const {
switch (type) {
case Type::CREATE_EDGE:
case Type::ADD_OUT_EDGE:
case Type::ADD_IN_EDGE:
case Type::REMOVE_IN_EDGE:
case Type::ADD_LABEL:
case Type::REMOVE_LABEL:
case Type::REMOVE_EDGE:
case Type::BUILD_INDEX:
// Helper macros for StateDelta::Decode. They rely on the names `decoder`,
// `dv`, `r_val` and `nullopt` being in scope at the expansion site.
// DECODE_MEMBER reads the next value into `dv`, returns nullopt from the
// enclosing function when the read fails, and otherwise stores `dv.value_f()`
// into `r_val.member`.
#define DECODE_MEMBER(member, value_f) \
if (!decoder.ReadValue(&dv)) return nullopt; \
r_val.member = dv.value_f();
// Same as DECODE_MEMBER, but the decoded value is static_cast to `type` before
// it is stored.
#define DECODE_MEMBER_CAST(member, value_f, type) \
if (!decoder.ReadValue(&dv)) return nullopt; \
r_val.member = static_cast<type>(dv.value_f());
// Decodes one StateDelta. The delta type is read first (an integer cast to
// StateDelta::Type), then the transaction id, then the members that belong to
// that type. Afterwards the hash accumulated by `reader` is compared with the
// hash value read from the stream.
// Returns nullopt when any read fails, when the hashes differ, or when a
// communication::bolt::ValueException or std::ifstream::failure is caught.
// NOTE(review): several case labels, all `break` statements, the closing
// braces and the end of the vertex_from_address DECODE_MEMBER_CAST call are
// not visible in this excerpt; the code is kept as found.
std::optional<StateDelta> StateDelta::Decode(
HashedFileReader &reader,
communication::bolt::Decoder<HashedFileReader> &decoder) {
using std::nullopt;
StateDelta r_val;
// The decoded value used as a temporary while decoding.
communication::bolt::Value dv;
try {
if (!decoder.ReadValue(&dv)) return nullopt;
r_val.type = static_cast<enum StateDelta::Type>(dv.ValueInt());
DECODE_MEMBER(transaction_id, ValueInt)
switch (r_val.type) {
DECODE_MEMBER(vertex_id, ValueInt)
DECODE_MEMBER(cypher_id, ValueInt)
case Type::CREATE_EDGE:
DECODE_MEMBER(edge_id, ValueInt)
DECODE_MEMBER(cypher_id, ValueInt)
DECODE_MEMBER(vertex_from_id, ValueInt)
DECODE_MEMBER(vertex_to_id, ValueInt)
DECODE_MEMBER_CAST(edge_type, ValueInt, storage::EdgeType)
DECODE_MEMBER(edge_type_name, ValueString)
case Type::ADD_OUT_EDGE:
DECODE_MEMBER(vertex_id, ValueInt)
DECODE_MEMBER_CAST(vertex_to_address, ValueInt, storage::VertexAddress)
DECODE_MEMBER_CAST(edge_address, ValueInt, storage::EdgeAddress)
DECODE_MEMBER_CAST(edge_type, ValueInt, storage::EdgeType)
DECODE_MEMBER(vertex_id, ValueInt)
DECODE_MEMBER_CAST(edge_address, ValueInt, storage::EdgeAddress)
case Type::ADD_IN_EDGE:
DECODE_MEMBER(vertex_id, ValueInt)
DECODE_MEMBER_CAST(vertex_from_address, ValueInt,
DECODE_MEMBER_CAST(edge_address, ValueInt, storage::EdgeAddress)
DECODE_MEMBER_CAST(edge_type, ValueInt, storage::EdgeType)
case Type::REMOVE_IN_EDGE:
DECODE_MEMBER(vertex_id, ValueInt)
DECODE_MEMBER_CAST(edge_address, ValueInt, storage::EdgeAddress)
DECODE_MEMBER(vertex_id, ValueInt)
DECODE_MEMBER_CAST(property, ValueInt, storage::Property)
DECODE_MEMBER(property_name, ValueString)
if (!decoder.ReadValue(&dv)) return nullopt;
r_val.value = glue::ToPropertyValue(dv);
DECODE_MEMBER(edge_id, ValueInt)
DECODE_MEMBER_CAST(property, ValueInt, storage::Property)
DECODE_MEMBER(property_name, ValueString)
if (!decoder.ReadValue(&dv)) return nullopt;
r_val.value = glue::ToPropertyValue(dv);
case Type::ADD_LABEL:
case Type::REMOVE_LABEL:
DECODE_MEMBER(vertex_id, ValueInt)
DECODE_MEMBER_CAST(label, ValueInt, storage::Label)
DECODE_MEMBER(label_name, ValueString)
DECODE_MEMBER(vertex_id, ValueInt)
case Type::REMOVE_EDGE:
DECODE_MEMBER(edge_id, ValueInt)
case Type::BUILD_INDEX:
DECODE_MEMBER_CAST(label, ValueInt, storage::Label)
DECODE_MEMBER(label_name, ValueString)
DECODE_MEMBER_CAST(property, ValueInt, storage::Property)
DECODE_MEMBER(property_name, ValueString)
// The hash is taken before the stored hash value is read from the stream.
auto decoder_hash = reader.hash();
uint64_t encoded_hash;
if (!reader.ReadType(encoded_hash, true)) return nullopt;
if (decoder_hash != encoded_hash) return nullopt;
return r_val;
} catch (communication::bolt::ValueException &) {
return nullopt;
} catch (std::ifstream::failure &) {
return nullopt;
// Applies this delta to `dba`, dispatching on `type`.
// The visible LOG(FATAL) messages show that transaction handling, partial edge
// creation/deletion and index handling are not supported here. Edge types and
// properties are resolved through `dba` from their stored names
// (edge_type_name, property_name) rather than from the stored numeric values.
// NOTE(review): several case labels, the tail of the InsertEdge call, the
// label/removal statements, all `break` statements and the closing braces are
// not visible in this excerpt; the code is kept as found.
void StateDelta::Apply(GraphDbAccessor &dba) const {
switch (type) {
// Transactional state is not recovered.
LOG(FATAL) << "Transaction handling not handled in Apply";
dba.InsertVertex(vertex_id, cypher_id);
case Type::CREATE_EDGE: {
auto from = dba.FindVertex(vertex_from_id, true);
auto to = dba.FindVertex(vertex_to_id, true);
dba.InsertEdge(from, to, dba.EdgeType(edge_type_name), edge_id,
case Type::ADD_OUT_EDGE:
case Type::ADD_IN_EDGE:
case Type::REMOVE_IN_EDGE:
LOG(FATAL) << "Partial edge creation/deletion not yet supported in Apply";
auto vertex = dba.FindVertex(vertex_id, true);
vertex.PropsSet(dba.Property(property_name), value);
auto edge = dba.FindEdge(edge_id, true);
edge.PropsSet(dba.Property(property_name), value);
case Type::ADD_LABEL: {
auto vertex = dba.FindVertex(vertex_id, true);
case Type::REMOVE_LABEL: {
auto vertex = dba.FindVertex(vertex_id, true);
case Type::REMOVE_VERTEX: {
auto vertex = dba.FindVertex(vertex_id, true);
case Type::REMOVE_EDGE: {
auto edge = dba.FindEdge(edge_id, true);
case Type::BUILD_INDEX: {
LOG(FATAL) << "Index handling not handled in Apply";
}; // namespace database

View File

@ -1,151 +0,0 @@
#pragma once
#include "communication/bolt/v1/decoder/decoder.hpp"
#include "communication/bolt/v1/encoder/base_encoder.hpp"
#include "durability/hashed_file_reader.hpp"
#include "durability/hashed_file_writer.hpp"
#include "storage/common/types/property_value.hpp"
#include "storage/common/types/types.hpp"
#include "storage/distributed/address_types.hpp"
#include "storage/distributed/gid.hpp"
#include "utils/typeinfo.hpp"
(lcp:namespace database)
class GraphDbAccessor;
(lcp:define-struct state-delta ()
;; Members valid for every delta.
(type "Type")
(transaction-id "::tx::TransactionId")
;; Members valid only for some deltas, see StateDelta::Type comments above.
;; TODO: when preparing the WAL for distributed, most likely remove Gids and
;; only keep addresses.
(vertex-id "::gid::Gid")
(edge-id "::gid::Gid")
(cypher-id :int64_t)
(edge-address "::storage::EdgeAddress")
(vertex-from-id "::gid::Gid")
(vertex-from-address "::storage::VertexAddress")
(vertex-to-id "::gid::Gid")
(vertex-to-address "::storage::VertexAddress")
(edge-type "::storage::EdgeType")
(edge-type-name "std::string")
(property "::storage::Property")
(property-name "std::string")
(value "PropertyValue" :initval "PropertyValue::Null")
(label "::storage::Label")
(label-name "std::string")
(check-empty :bool))
"Describes single change to the database state. Used for durability (WAL) and
for distributed remote storage changes.
Labels, Properties and EdgeTypes are stored both as values (integers) and
strings (their names). The values are used when applying deltas in a running
database. Names are used when recovering the database as it's not guaranteed
that after recovery the old name<->value mapping will be preserved.
TODO: ensure the mapping is preserved after recovery and don't save strings
in StateDeltas.")
(lcp:define-enum type
create-vertex ;; vertex_id
create-edge ;; edge_id, from_vertex_id, to_vertex_id, edge_type, edge_type_name
add-out-edge ;; vertex_id, edge_address, vertex_to_address, edge_type
remove-out-edge ;; vertex_id, edge_address
add-in-edge ;; vertex_id, edge_address, vertex_from_address, edge_type
remove-in-edge ;; vertex_id, edge_address
set-property-vertex ;; vertex_id, property, property_name, property_value
set-property-edge ;; edge_id, property, property_name, property_value
;; remove property is done by setting a PropertyValue::Null
add-label ;; vertex_id, label, label_name
remove-label ;; vertex_id, label, label_name
remove-vertex ;; vertex_id, check_empty
remove-edge ;; edge_id
build-index ;; label, label_name, property, property_name
"Defines StateDelta type. For each type the comment indicates which values
need to be stored. All deltas have the transaction_id member, so that's
omitted in the comment.")
// Default constructor; members keep their declared defaults.
StateDelta() = default;
// Creates a delta of the given type for transaction `tx_id`. Only `type` and
// `transaction_id` are initialised here; the static factory functions set the
// remaining type-specific members.
StateDelta(const enum Type &type, tx::TransactionId tx_id)
: type(type), transaction_id(tx_id) {}
/** Attempts to decode a StateDelta from the given decoder. Returns the
* decoded value if successful, otherwise returns nullopt. */
static std::optional<StateDelta> Decode(
HashedFileReader &reader,
communication::bolt::Decoder<HashedFileReader> &decoder);
/** Encodes the delta using primitive encoder, and writes out the new hash
* with delta to the writer */
void Encode(
HashedFileWriter &writer,
communication::bolt::BaseEncoder<HashedFileWriter> &encoder) const;
// Static factory functions, one per delta type. Each takes the id of the
// transaction the delta belongs to plus the members that are valid for that
// type, and returns the populated StateDelta.

// Transaction control deltas.
static StateDelta TxBegin(tx::TransactionId tx_id);
static StateDelta TxCommit(tx::TransactionId tx_id);
static StateDelta TxAbort(tx::TransactionId tx_id);
// Creation of a vertex / an edge identified by gid.
static StateDelta CreateVertex(tx::TransactionId tx_id,
gid::Gid vertex_id,
int64_t cypher_id);
static StateDelta CreateEdge(tx::TransactionId tx_id, gid::Gid edge_id,
int64_t cypher_id,
gid::Gid vertex_from_id,
gid::Gid vertex_to_id,
storage::EdgeType edge_type,
const std::string &edge_type_name);
// Adding / removing an edge on one vertex's out or in side; these take
// addresses instead of gids for the edge and the other vertex.
static StateDelta AddOutEdge(tx::TransactionId tx_id, gid::Gid vertex_id,
storage::VertexAddress vertex_to_address,
storage::EdgeAddress edge_address,
storage::EdgeType edge_type);
static StateDelta RemoveOutEdge(tx::TransactionId tx_id,
gid::Gid vertex_id,
storage::EdgeAddress edge_address);
static StateDelta AddInEdge(tx::TransactionId tx_id, gid::Gid vertex_id,
storage::VertexAddress vertex_from_address,
storage::EdgeAddress edge_address,
storage::EdgeType edge_type);
static StateDelta RemoveInEdge(tx::TransactionId tx_id, gid::Gid vertex_id,
storage::EdgeAddress edge_address);
// Property updates on a vertex / an edge.
static StateDelta PropsSetVertex(tx::TransactionId tx_id,
gid::Gid vertex_id,
storage::Property property,
const std::string &property_name,
const PropertyValue &value);
static StateDelta PropsSetEdge(tx::TransactionId tx_id, gid::Gid edge_id,
storage::Property property,
const std::string &property_name,
const PropertyValue &value);
// Label updates on a vertex.
static StateDelta AddLabel(tx::TransactionId tx_id, gid::Gid vertex_id,
storage::Label label,
const std::string &label_name);
static StateDelta RemoveLabel(tx::TransactionId tx_id, gid::Gid vertex_id,
storage::Label label,
const std::string &label_name);
// Removal of a vertex / an edge identified by gid.
static StateDelta RemoveVertex(tx::TransactionId tx_id, gid::Gid vertex_id,
bool check_empty);
static StateDelta RemoveEdge(tx::TransactionId tx_id, gid::Gid edge_id);
// Index creation on a label + property pair.
static StateDelta BuildIndex(tx::TransactionId tx_id, storage::Label label,
const std::string &label_name,
storage::Property property,
const std::string &property_name);
/// Applies CRUD delta to database accessor. Fails on other types of deltas
void Apply(GraphDbAccessor &dba) const;
(:serialize (:slk)))
(lcp:pop-namespace) ;; database

View File

@ -1,52 +0,0 @@
#pragma once
/// IMPORTANT: Please update this file for every snapshot format change!!!
/// TODO (buda): This is not rock solid.
#include <array>
#include <cstdint>
namespace durability {

// Magic numbers for snapshot files and for write-ahead log files.
constexpr std::array<uint8_t, 4> kSnapshotMagic = {{'M', 'G', 's', 'n'}};
constexpr std::array<uint8_t, 4> kWalMagic = {{'M', 'G', 'w', 'l'}};

// Default version used when encoding / decoding both snapshots and the WAL.
constexpr int64_t kVersion = 6;

// Layout of a version 6 snapshot, in the order the sections appear:
//
//  1) Magic number followed by the snapshot version.
//  2) ID of the distributed worker.
//
//  Sections 3 and 4 give the starting points for generating new vertex / edge
//  IDs in the DB. They matter when vertices / edges have been moved to another
//  worker (in distributed Memgraph).
//  3) Vertex generator ID.
//  4) Edge generator ID.
//
//  Sections 5 and 6 are needed to determine record visibility when recovery
//  combines a snapshot with the WAL.
//  5) Transactional ID of the snapshooter.
//  6) Transactional snapshot of the snapshooter.
//
//  7) List of label+property indices.
//
//  Edges have to be inlined with the nodes because some edges might be stored
//  on another worker (an edge is always stored only on the worker of its
//  source).
//  8) Bolt encoded nodes, each written as:
//       - gid, labels, properties
//       - cypher_id
//       - inlined edges (edge address, other endpoint address and edge type)
//  9) Bolt encoded edges, each written as:
//       - gid
//       - from, to
//       - edge_type
//       - properties
//       - cypher_id
// 10) Snapshot summary (number of nodes, number of edges, hash).

}  // namespace durability

View File

@ -1,161 +0,0 @@
#include "durability/distributed/wal.hpp"
#include "durability/distributed/paths.hpp"
#include "durability/distributed/version.hpp"
#include "utils/file.hpp"
#include "utils/flag_validation.hpp"
// Command-line flags of the write-ahead log.
// NOTE(review): the macro heads of the first two definitions are not visible
// in this excerpt; only their argument lists remain.
// wal_flush_interval_millis: default 2.
wal_flush_interval_millis, 2,
"Interval between two write-ahead log flushes, in milliseconds.");
// wal_rotate_deltas_count: default 10000; compared against the per-file delta
// count in WalFile::Flush.
wal_rotate_deltas_count, 10000,
"How many write-ahead deltas should be stored in a single WAL file "
"before rotating it.");
// wal_buffer_size: default 4096, validated with FLAG_IN_RANGE(1, 1 << 30);
// passed to the `deltas_` buffer in the WriteAheadLog constructor.
DEFINE_VALIDATED_HIDDEN_int32(wal_buffer_size, 4096,
"Write-ahead log buffer size.",
FLAG_IN_RANGE(1, 1 << 30));
namespace durability {
// Constructs the WAL: the delta buffer is created with FLAGS_wal_buffer_size,
// the WAL file handler is set up for `worker_id` inside `durability_dir`, and
// the synchronous-commit mode is stored.
// NOTE(review): the initialiser of durability_enabled_, the body of the `if`
// and the closing braces are not visible in this excerpt.
WriteAheadLog::WriteAheadLog(int worker_id,
const std::filesystem::path &durability_dir,
bool durability_enabled, bool synchronous_commit)
: deltas_{FLAGS_wal_buffer_size},
wal_file_{worker_id, durability_dir},
synchronous_commit_(synchronous_commit) {
if (durability_enabled_) {
// Destructor. When durability is enabled and the WAL is not in synchronous
// commit mode, the scheduler is stopped.
// NOTE(review): any further statements and the closing braces are not visible
// in this excerpt.
WriteAheadLog::~WriteAheadLog() {
if (durability_enabled_) {
if (!synchronous_commit_) scheduler_.Stop();
// Stores the worker id and derives the WAL directory as
// `durability_dir / kWalDir`. No file is opened here.
WriteAheadLog::WalFile::WalFile(int worker_id,
const std::filesystem::path &durability_dir)
: worker_id_(worker_id), wal_dir_{durability_dir / kWalDir} {}
// Closes the writer, but only when a current WAL file path is set.
// NOTE(review): the closing brace is not visible in this excerpt.
WriteAheadLog::WalFile::~WalFile() {
if (!current_wal_file_.empty()) writer_.Close();
// (Re)initialises the WAL file state. When the WAL directory cannot be
// ensured, or when std::ios_base::failure is caught, an error is logged and
// current_wal_file_ is reset to an empty path. latest_tx_ and
// current_wal_file_delta_count_ are reset to 0.
// NOTE(review): the statements inside the `try` block and the closing braces
// are not visible in this excerpt.
void WriteAheadLog::WalFile::Init() {
if (!utils::EnsureDir(wal_dir_)) {
LOG(ERROR) << "Can't write to WAL directory: " << wal_dir_;
current_wal_file_ = std::filesystem::path();
} else {
current_wal_file_ = WalFilenameForTransactionId(wal_dir_, worker_id_);
// TODO: Fix error handling, the encoder_ returns `true` or `false`.
try {
} catch (std::ios_base::failure &) {
LOG(ERROR) << "Failed to open write-ahead log file: "
<< current_wal_file_;
current_wal_file_ = std::filesystem::path();
latest_tx_ = 0;
current_wal_file_delta_count_ = 0;
// Drains `buffer` into the current WAL file while holding flush_mutex_.
// Each popped delta updates latest_tx_ (maximum transaction id seen) and is
// encoded through writer_ / encoder_; the per-file delta count is compared
// with FLAGS_wal_rotate_deltas_count. std::ios_base::failure and
// std::filesystem::filesystem_error are caught and logged.
// NOTE(review): the statements following the "uninitialized" log message, the
// statement guarded by the rotate-count check and the closing braces are not
// visible in this excerpt.
void WriteAheadLog::WalFile::Flush(RingBuffer<database::StateDelta> &buffer) {
std::lock_guard<std::mutex> flush_lock(flush_mutex_);
if (current_wal_file_.empty()) {
LOG(ERROR) << "Write-ahead log file uninitialized, discarding data.";
try {
while (true) {
// An empty result from pop() ends the loop.
auto delta = buffer.pop();
if (!delta) break;
latest_tx_ = std::max(latest_tx_, delta->transaction_id);
delta->Encode(writer_, encoder_);
if (++current_wal_file_delta_count_ >= FLAGS_wal_rotate_deltas_count)
} catch (std::ios_base::failure &) {
LOG(ERROR) << "Failed to write to write-ahead log, discarding data.";
} catch (std::filesystem::filesystem_error &) {
LOG(ERROR) << "Failed to rotate write-ahead log.";
// Rotates the current WAL file.
// NOTE(review): only the tail of one statement is visible in this excerpt; it
// passes WalFilenameForTransactionId(wal_dir_, worker_id_, latest_tx_) to a
// call whose name is missing (presumably a rename of the current file --
// confirm against the original source).
void WriteAheadLog::WalFile::RotateFile() {
WalFilenameForTransactionId(wal_dir_, worker_id_, latest_tx_));
// Enables the WAL when durability is enabled. When the WAL is not in
// synchronous commit mode, a callback that flushes `deltas_` into the WAL file
// is registered.
// NOTE(review): the call that receives the lambda (presumably the scheduler
// start), any WAL file initialisation and the closing braces are not visible
// in this excerpt.
void WriteAheadLog::Init() {
if (durability_enabled_) {
enabled_ = true;
if (!synchronous_commit_) {
[this]() { wal_file_.Flush(deltas_); });
// Handles a new delta only when durability is enabled and the WAL has been
// enabled by Init(). In synchronous commit mode a delta that ends a
// transaction gets additional handling.
// NOTE(review): the statements inside both `if` blocks (presumably the buffer
// insertion and an immediate flush) and the closing braces are not visible in
// this excerpt.
void WriteAheadLog::Emplace(const database::StateDelta &delta) {
if (durability_enabled_ && enabled_) {
if (synchronous_commit_ && IsStateDeltaTransactionEnd(delta)) {
// Returns true for TRANSACTION_COMMIT and TRANSACTION_ABORT deltas and false
// for every other delta type listed below.
// NOTE(review): the closing braces are not visible in this excerpt.
bool WriteAheadLog::IsStateDeltaTransactionEnd(
const database::StateDelta &delta) {
switch (delta.type) {
case database::StateDelta::Type::TRANSACTION_COMMIT:
case database::StateDelta::Type::TRANSACTION_ABORT:
return true;
case database::StateDelta::Type::TRANSACTION_BEGIN:
case database::StateDelta::Type::CREATE_VERTEX:
case database::StateDelta::Type::CREATE_EDGE:
case database::StateDelta::Type::ADD_OUT_EDGE:
case database::StateDelta::Type::REMOVE_OUT_EDGE:
case database::StateDelta::Type::ADD_IN_EDGE:
case database::StateDelta::Type::REMOVE_IN_EDGE:
case database::StateDelta::Type::SET_PROPERTY_VERTEX:
case database::StateDelta::Type::SET_PROPERTY_EDGE:
case database::StateDelta::Type::ADD_LABEL:
case database::StateDelta::Type::REMOVE_LABEL:
case database::StateDelta::Type::REMOVE_VERTEX:
case database::StateDelta::Type::REMOVE_EDGE:
case database::StateDelta::Type::BUILD_INDEX:
return false;
void WriteAheadLog::Flush() {
if (enabled_) {
} // namespace durability

View File

@ -1,99 +0,0 @@
#pragma once
#include <chrono>
#include <cstdint>
#include <filesystem>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include "communication/bolt/v1/encoder/base_encoder.hpp"
#include "data_structures/ring_buffer.hpp"
#include "durability/distributed/state_delta.hpp"
#include "storage/common/types/property_value.hpp"
#include "storage/common/types/types.hpp"
#include "storage/distributed/gid.hpp"
#include "transactions/type.hpp"
#include "utils/scheduler.hpp"
namespace durability {
/// A database StateDelta log for durability. Buffers and periodically
/// serializes fine-grained database deltas (StateDelta).
/// The order is not deterministic in a multithreaded scenario (multiple DB
/// transactions). This is fine, the recovery process should be immune to this
/// indeterminism.
class WriteAheadLog {
WriteAheadLog(int worker_id, const std::filesystem::path &durability_dir,
bool durability_enabled, bool synchronous_commit);
/// Initializes the WAL. Called at the end of GraphDb construction, after
/// (optional) recovery. Also responsible for initializing the wal_file.
void Init();
/// Emplaces the given StateDelta onto the buffer, if the WAL is enabled.
/// If the WAL is configured to work in synchronous commit mode, emplace will
/// flush the buffers if a delta represents a transaction end.
void Emplace(const database::StateDelta &delta);
/// Flushes every delta currently in the ring buffer.
/// This method should only be called from tests.
void Flush();
/// Groups the logic of WAL file handling (flushing, naming, rotating)
class WalFile {
WalFile(int worker_id, const std::filesystem::path &wal__dir);
/// Initializes the WAL file. Must be called before first flush. Can be
/// called after Flush() to re-initialize stuff.
void Init();
/// Flushes all the deltas in the buffer to the WAL file. If necessary
/// rotates the file.
void Flush(RingBuffer<database::StateDelta> &buffer);
/// Mutex used for flushing wal data
std::mutex flush_mutex_;
int worker_id_;
const std::filesystem::path wal_dir_;
HashedFileWriter writer_;
communication::bolt::BaseEncoder<HashedFileWriter> encoder_{writer_};
/// The file to which the WAL flushes data. The path is fixed, the file gets
/// moved when the WAL gets rotated.
std::filesystem::path current_wal_file_;
/// Number of deltas in the current wal file.
int current_wal_file_delta_count_{0};
/// The latest transaction whose delta is recorded in the current WAL file.
/// Zero indicates that no deltas have so far been written to the current
/// WAL file.
tx::TransactionId latest_tx_{0};
void RotateFile();
RingBuffer<database::StateDelta> deltas_;
utils::Scheduler scheduler_;
WalFile wal_file_;
/// Used for disabling the durability feature of the DB.
bool durability_enabled_{false};
/// Used for disabling the WAL during DB recovery.
bool enabled_{false};
/// Should every WAL write be synced with the underlying storage.
bool synchronous_commit_{false};
/// Checks whether the given state delta represents a transaction end
/// (TRANSACTION_COMMIT or TRANSACTION_ABORT).
bool IsStateDeltaTransactionEnd(const database::StateDelta &delta);
} // namespace durability

View File

@ -1,174 +0,0 @@
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <exception>
#include <functional>
#include <limits>
#include <thread>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include "communication/server.hpp"
#include "database/distributed/distributed_graph_db.hpp"
#include "integrations/kafka/exceptions.hpp"
#include "integrations/kafka/streams.hpp"
#include "memgraph_init.hpp"
#include "query/distributed/interpreter.hpp"
#include "query/exceptions.hpp"
#include "telemetry/telemetry.hpp"
#include "utils/flag_validation.hpp"
// General purpose flags.
DEFINE_string(interface, "",
"Communication interface on which to listen.");
DEFINE_VALIDATED_int32(port, 7687, "Communication port on which to listen.",
FLAG_IN_RANGE(0, std::numeric_limits<uint16_t>::max()));
std::max(std::thread::hardware_concurrency(), 1U),
"Number of workers (Bolt)", FLAG_IN_RANGE(1, INT32_MAX));
DEFINE_VALIDATED_int32(session_inactivity_timeout, 1800,
"Time in seconds after which inactive sessions will be "
DEFINE_string(cert_file, "", "Certificate file to use.");
DEFINE_string(key_file, "", "Key file to use.");
DEFINE_bool(telemetry_enabled, false,
"Set to true to enable telemetry. We collect information about the "
"running system (CPU and memory information) and information about "
"the database runtime (vertex and edge counts and resource usage) "
"to allow for easier improvement of the product.");
// Audit logging flags.
DEFINE_bool(audit_enabled, false, "Set to true to enable audit logging.");
DEFINE_VALIDATED_int32(audit_buffer_size, audit::kBufferSizeDefault,
"Maximum number of items in the audit log buffer.",
audit_buffer_flush_interval_ms, audit::kBufferFlushIntervalMillisDefault,
"Interval (in milliseconds) used for flushing the audit log buffer.",
using ServerT = communication::Server<BoltSession, SessionData>;
using communication::ServerContext;
// Distributed flags.
master, false,
"If this Memgraph server is the master in a distributed deployment.");
worker, false,
"If this Memgraph server is a worker in a distributed deployment.");
void MasterMain() {
google::SetUsageMessage("Memgraph distributed master");
auto durability_directory = std::filesystem::path(FLAGS_durability_directory);
auth::Auth auth{durability_directory / "auth"};
audit::Log audit_log{durability_directory / "audit", FLAGS_audit_buffer_size,
if (FLAGS_audit_enabled) {
utils::Signal::User2, [&audit_log]() { audit_log.ReopenLog(); }))
<< "Unable to register SIGUSR2 handler!";
database::Master db;
query::DistributedInterpreter interpreter(&db);
SessionData session_data{&db, &interpreter, &auth, &audit_log};
integrations::kafka::Streams kafka_streams{
durability_directory / "streams",
const std::string &query,
const std::map<std::string, communication::bolt::Value> &params) {
KafkaStreamWriter(session_data, query, params);
try {
// Recover possible streams.
} catch (const integrations::kafka::KafkaStreamException &e) {
LOG(ERROR) << e.what();
session_data.interpreter->auth_ = &auth;
session_data.interpreter->kafka_streams_ = &kafka_streams;
ServerContext context;
std::string service_name = "Bolt";
if (FLAGS_key_file != "" && FLAGS_cert_file != "") {
context = ServerContext(FLAGS_key_file, FLAGS_cert_file);
service_name = "BoltS";
ServerT server({FLAGS_interface, static_cast<uint16_t>(FLAGS_port)},
&session_data, &context, FLAGS_session_inactivity_timeout,
service_name, FLAGS_num_workers);
// Handler for regular termination signals
auto shutdown = [&db] {
// We call the shutdown method on the master database so that we exit
// cleanly.
// Start the database.
// Start the Bolt server.
CHECK(server.Start()) << "Couldn't start the Bolt server!";
// The return code of `AwaitShutdown` is ignored because we want the database
// to exit cleanly no matter what.
db.AwaitShutdown([&server] {
// Server needs to be shutdown first and then the database. This prevents a
// race condition when a transaction is accepted during server shutdown.
void WorkerMain() {
google::SetUsageMessage("Memgraph distributed worker");
database::Worker db;
// Handler for regular termination signals
auto shutdown = [&db] {
// We call the shutdown method on the worker database so that we exit
// cleanly.
// Start the database.
// The return code of `AwaitShutdown` is ignored because we want the database
// to exit cleanly no matter what.
int main(int argc, char **argv) {
auto memgraph_main = [&]() {
CHECK(!(FLAGS_master && FLAGS_worker))
<< "Can't run Memgraph as worker and master at the same time!";
CHECK(FLAGS_master || FLAGS_worker)
<< "You must specify that Memgraph should be either a master or a worker!";
if (FLAGS_master)
return WithInit(argc, argv, memgraph_main);

View File

@ -124,21 +124,12 @@ void KafkaStreamWriter(
for (const auto &kv : params)
params_pv.emplace(kv.first, glue::ToPropertyValue(kv.second));
try {
(*session_data.interpreter)(query, dba, params_pv, false).PullAll(stream);
(*session_data.interpreter)(query, *dba, params_pv, false).PullAll(stream);
} catch (const utils::BasicException &e) {
LOG(WARNING) << "[Kafka] query execution failed with an exception: "
<< e.what();

View File

@ -1,56 +0,0 @@
#pragma once
#include <type_traits>
#include "query/frontend/ast/ast.hpp"
#include "query/distributed/serialization.hpp"
#include "storage/distributed/rpc/serialization.hpp"
(load "query/frontend/ast/ast.lcp")
(lcp:namespace query)
/// Primary function for saving Ast nodes via SLK.
void SaveAstPointer(const Tree *ast, slk::Builder *builder);
Tree *Load(AstStorage *ast, slk::Reader *reader);
/// Primary function for loading Ast nodes via SLK.
///
/// Reads a bool presence flag from `reader`. When the flag is false, nullptr
/// is returned and nothing else is read. Otherwise a node is loaded through
/// `Load(ast, reader)` and downcast to `TAst`.
///
/// @tparam TAst node type to return; must derive from `query::Tree`.
/// @param ast storage handed on to `Load`.
/// @param reader SLK reader the flag and the node are read from.
/// @return the loaded node as `TAst *`, or nullptr when the flag is false.
/// @throw slk::SlkDecodeException when the downcast to `TAst` yields null.
template <class TAst>
TAst *LoadAstPointer(AstStorage *ast, slk::Reader *reader) {
// Compile-time guard: only Tree-derived types may be requested.
static_assert(std::is_base_of<query::Tree, TAst>::value);
bool has_ptr = false;
slk::Load(&has_ptr, reader);
if (!has_ptr) {
return nullptr;
auto *ret = utils::Downcast<TAst>(Load(ast, reader));
// A null result means the loaded node is not of the requested type.
if (!ret) {
throw slk::SlkDecodeException("Loading unknown Ast node type");
return ret;
void SaveAstPointer(const Tree *ast, slk::Builder *builder) {
slk::Save(static_cast<bool>(ast), builder);
if (!ast) {
slk::Save(*ast, builder);
Tree *Load(AstStorage *ast, slk::Reader *reader) {
std::unique_ptr<Tree> root;
slk::ConstructAndLoad(&root, reader, ast);
return ast->storage_.back().get();
(lcp:pop-namespace) ;; namespace query

View File

@ -1,10 +0,0 @@
#pragma once
#include "query/frontend/semantic/symbol.hpp"
;; Generate serialization code
;; TODO: This should be merged with query/distributed/serialization
(load "query/frontend/semantic/symbol.lcp")

View File

@ -1,167 +0,0 @@
#include "query/distributed/interpreter.hpp"
#include "database/distributed/distributed_graph_db.hpp"
#include "distributed/plan_dispatcher.hpp"
#include "query/frontend/semantic/symbol_generator.hpp"
#include "query/distributed/plan/planner.hpp"
#include "query/distributed/plan/pretty_print.hpp"
#include "query/plan/planner.hpp"
#include "query/plan/rule_based_planner.hpp"
#include "query/plan/vertex_count_cache.hpp"
namespace query {
namespace {
class DistributedLogicalPlan final : public LogicalPlan {
DistributedLogicalPlan(plan::DistributedPlan plan, double cost,
distributed::PlanDispatcher *plan_dispatcher)
: plan_(std::move(plan)), plan_dispatcher_(plan_dispatcher), cost_(cost) {
for (const auto &plan_pair : plan_.worker_plans) {
const auto &plan_id = plan_pair.first;
const auto &worker_plan = plan_pair.second;
plan_dispatcher_->DispatchPlan(plan_id, worker_plan, plan_.symbol_table);
~DistributedLogicalPlan() {
for (const auto &plan_pair : plan_.worker_plans) {
const auto &plan_id = plan_pair.first;
try {
} catch (const communication::rpc::RpcFailedException &) {
// We ignore RPC exceptions here because the other side can be possibly
// shutting down. TODO: If that is not the case then something is really
// wrong with the cluster!
const plan::LogicalOperator &GetRoot() const override {
return *plan_.master_plan;
double GetCost() const override { return cost_; }
const SymbolTable &GetSymbolTable() const override {
return plan_.symbol_table;
const AstStorage &GetAstStorage() const override {
return plan_.ast_storage;
plan::DistributedPlan plan_;
distributed::PlanDispatcher *plan_dispatcher_{nullptr};
double cost_;
class DistributedPostProcessor final {
// Original plan before rewrite, needed only for temporary cost estimation
// implementation.
std::unique_ptr<plan::LogicalOperator> original_plan_;
std::atomic<int64_t> *next_plan_id_;
Parameters parameters_;
using ProcessedPlan = plan::DistributedPlan;
DistributedPostProcessor(const Parameters &parameters,
std::atomic<int64_t> *next_plan_id)
: next_plan_id_(next_plan_id), parameters_(parameters) {}
template <class TPlanningContext>
plan::DistributedPlan Rewrite(std::unique_ptr<plan::LogicalOperator> plan,
TPlanningContext *context) {
plan::PostProcessor post_processor(parameters_);
original_plan_ = post_processor.Rewrite(std::move(plan), context);
const auto &property_names = context->ast_storage->properties_;
std::vector<storage::Property> properties_by_ix;
for (const auto &name : property_names) {
return MakeDistributedPlan(*context->ast_storage, *original_plan_,
*context->symbol_table, *next_plan_id_,
template <class TVertexCounts>
double EstimatePlanCost(const plan::DistributedPlan &plan,
TVertexCounts *vertex_counts) {
// TODO: Make cost estimation work with distributed plan.
return ::query::plan::EstimatePlanCost(vertex_counts, parameters_,
template <class TPlanningContext>
plan::DistributedPlan MergeWithCombinator(plan::DistributedPlan curr_plan,
plan::DistributedPlan last_plan,
const Tree &combinator,
TPlanningContext *context) {
throw utils::NotYetImplemented("query combinator");
template <class TPlanningContext>
plan::DistributedPlan MakeDistinct(plan::DistributedPlan last_op,
TPlanningContext *context) {
throw utils::NotYetImplemented("query combinator");
} // namespace
DistributedInterpreter::DistributedInterpreter(database::Master *db)
: plan_dispatcher_(&db->plan_dispatcher()) {}
std::unique_ptr<LogicalPlan> DistributedInterpreter::MakeLogicalPlan(
CypherQuery *query, AstStorage ast_storage, const Parameters &parameters,
database::GraphDbAccessor *db_accessor) {
auto vertex_counts = plan::MakeVertexCountCache(db_accessor);
auto symbol_table = MakeSymbolTable(query);
auto planning_context = plan::MakePlanningContext(&ast_storage, &symbol_table,
query, &vertex_counts);
DistributedPostProcessor distributed_post_processor(parameters,
plan::DistributedPlan plan;
double cost;
std::tie(plan, cost) = plan::MakeLogicalPlan(
&planning_context, &distributed_post_processor, FLAGS_query_cost_planner);
VLOG(10) << "[Interpreter] Created plan for distributed execution "
<< next_plan_id_ - 1;
return std::make_unique<DistributedLogicalPlan>(std::move(plan), cost,
Interpreter::Results DistributedInterpreter::operator()(
const std::string &query_string, database::GraphDbAccessor &db_accessor,
const std::map<std::string, PropertyValue> &params,
bool in_explicit_transaction) {
AstStorage ast_storage;
Parameters parameters;
auto queries = StripAndParseQuery(query_string, &parameters, &ast_storage,
&db_accessor, params);
ParsedQuery &parsed_query = queries.second;
if (utils::IsSubtype(*parsed_query.query, ProfileQuery::kType)) {
throw utils::NotYetImplemented("PROFILE in a distributed query");
return Interpreter::operator()(query_string, db_accessor, params,
void DistributedInterpreter::PrettyPrintPlan(
const database::GraphDbAccessor &dba,
const plan::LogicalOperator *plan_root, std::ostream *out) {
plan::DistributedPrettyPrint(dba, plan_root, out);
std::string DistributedInterpreter::PlanToJson(
const database::GraphDbAccessor &dba,
const plan::LogicalOperator *plan_root) {
return plan::DistributedPlanToJson(dba, plan_root).dump();
} // namespace query

View File

@ -1,38 +0,0 @@
#pragma once
#include "query/interpreter.hpp"
namespace database {
class Master;
namespace distributed {
class PlanDispatcher;
namespace query {
/// `Interpreter` subclass built from a `database::Master`. It overrides query
/// execution (`operator()`), logical plan creation, and the pretty-print and
/// JSON export of plans.
class DistributedInterpreter final : public Interpreter {
DistributedInterpreter(database::Master *db);
/// Executes a query. The arguments are, in order: the query string, the
/// database accessor, the query parameters, and whether the call happens
/// inside an explicit transaction.
Results operator()(const std::string &, database::GraphDbAccessor &,
const std::map<std::string, PropertyValue> &,
bool in_explicit_transaction) override;
/// Builds the logical plan for the given Cypher query.
std::unique_ptr<LogicalPlan> MakeLogicalPlan(
CypherQuery *, AstStorage, const Parameters &,
database::GraphDbAccessor *) override;
/// Writes a textual rendering of the plan to the given stream.
void PrettyPrintPlan(const database::GraphDbAccessor &,
const plan::LogicalOperator *, std::ostream *) override;
/// Returns the plan rendered as a JSON string.
std::string PlanToJson(const database::GraphDbAccessor &,
const plan::LogicalOperator *) override;
/// Atomic plan id counter, initialised to 0.
std::atomic<int64_t> next_plan_id_{0};
/// Pointer to the plan dispatcher; null until set.
distributed::PlanDispatcher *plan_dispatcher_{nullptr};
} // namespace query

File diff suppressed because it is too large Load Diff

View File

@ -1,357 +0,0 @@
/// @file
#pragma once
#include "query/distributed/frontend/ast/ast_serialization.hpp"
#include "query/plan/operator.hpp"
#include "query/distributed/serialization.hpp"
#include "storage/distributed/rpc/serialization.hpp"
(load "query/plan/operator.lcp")
(lcp:namespace query)
(lcp:namespace plan)
class PullRemote;
class Synchronize;
class PullRemoteOrderBy;
class DistributedExpand;
class DistributedExpandBfs;
class DistributedCreateNode;
class DistributedCreateExpand;
using DistributedOperatorCompositeVisitor =
::utils::CompositeVisitor<PullRemote, Synchronize, PullRemoteOrderBy,
DistributedExpand, DistributedExpandBfs,
DistributedCreateNode, DistributedCreateExpand>;
/// Base class for visiting regular and distributed LogicalOperator instances.
/// HierarchicalLogicalOperatorVisitor is inherited virtually, so that potential
/// multiple inheritance of DistributedOperatorVisitor and other
/// HierarchicalLogicalOperatorVisitor derived types is possible. Note that
/// virtual inheritance resolves the diamond problem, but this still carries a
/// cost. For example, you can no longer use static_cast to downcast a type even
/// though you are 100% sure downcast would be correct. dynamic_cast should work
/// as usual.
class DistributedOperatorVisitor : public virtual HierarchicalLogicalOperatorVisitor,
public DistributedOperatorCompositeVisitor {
using DistributedOperatorCompositeVisitor::PostVisit;
using DistributedOperatorCompositeVisitor::PreVisit;
using HierarchicalLogicalOperatorVisitor::PostVisit;
using HierarchicalLogicalOperatorVisitor::PreVisit;
using HierarchicalLogicalOperatorVisitor::Visit;
(lcp:define-class pull-remote (logical-operator)
((input "std::shared_ptr<LogicalOperator>" :scope :public
:slk-save #'slk-save-operator-pointer
:slk-load #'slk-load-operator-pointer)
(plan-id :int64_t :initval 0 :scope :public)
(symbols "std::vector<Symbol>" :scope :public))
"An operator in distributed Memgraph that yields both local and remote (from
other workers) frames. Obtaining remote frames is done through RPC calls to
`distributed::ProduceRpcServer`s running on all the workers.
This operator aims to yield results as fast as possible and lose minimal
time on data transfer. It gives no guarantees on result order.")
PullRemote() {}
PullRemote(const std::shared_ptr<LogicalOperator> &input, int64_t plan_id,
const std::vector<Symbol> &symbols)
: input_(input), plan_id_(plan_id), symbols_(symbols) {}
bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override;
UniqueCursorPtr MakeCursor(utils::MemoryResource *) const override;
std::vector<Symbol> OutputSymbols(const SymbolTable &) const override;
std::vector<Symbol> ModifiedSymbols(const SymbolTable &) const override;
bool HasSingleInput() const override { return true; }
std::shared_ptr<LogicalOperator> input() const override { return input_; }
void set_input(std::shared_ptr<LogicalOperator> input) override {
input_ = input;
(:serialize (:slk))
(defun slk-load-pull-remote (member)
std::shared_ptr<query::plan::LogicalOperator> op;
&op, reader, &helper->loaded_ops,
[&helper](auto *op, auto *reader) {
slk::ConstructAndLoad(op, reader, helper);
self->${member} = std::static_pointer_cast<query::plan::PullRemote>(op);
(lcp:define-class synchronize (logical-operator)
((input "std::shared_ptr<LogicalOperator>" :scope :public
:slk-save #'slk-save-operator-pointer
:slk-load #'slk-load-operator-pointer)
(pull-remote "std::shared_ptr<PullRemote>" :scope :public
:slk-save #'slk-save-operator-pointer
:slk-load #'slk-load-pull-remote
:clone (lambda (source dest)
if (${source}) {
std::shared_ptr<LogicalOperator> tmp = ${source}->Clone(storage);
${dest} = std::static_pointer_cast<PullRemote>(tmp);
} else {
${dest} = nullptr;
(advance-command :bool :initval "false" :scope :public))
"Operator used to synchronize stages of plan execution between the master and
all the workers. Synchronization is necessary in queries that update the
graph state because updates (as well as creations and deletions) are deferred
to avoid multithreaded modification of graph element data (as it's not
Logic of the synchronize operator is:
1. If there is a Pull, tell all the workers to pull on that plan and
accumulate results without sending them to the master. This is async.
2. Accumulate local results, in parallel with 1. getting executed on workers.
3. Wait till the master and all the workers are done accumulating.
4. Advance the command, if necessary.
5. Tell all the workers to apply their updates. This is async.
6. Apply local updates, in parallel with 5. on the workers.
7. Notify workers that the command has advanced, if necessary.
8. Yield all the results, first local, then from Pull if available.")
Synchronize() {}
Synchronize(const std::shared_ptr<LogicalOperator> &input,
const std::shared_ptr<PullRemote> &pull_remote,
bool advance_command)
: input_(input),
advance_command_(advance_command) {}
bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override;
UniqueCursorPtr MakeCursor(utils::MemoryResource *) const override;
std::vector<Symbol> ModifiedSymbols(const SymbolTable &) const override;
std::vector<Symbol> OutputSymbols(
const SymbolTable &symbol_table) const override {
return input_->OutputSymbols(symbol_table);
bool HasSingleInput() const override { return true; }
std::shared_ptr<LogicalOperator> input() const override { return input_; }
void set_input(std::shared_ptr<LogicalOperator> input) override {
input_ = input;
(:serialize (:slk))
(lcp:define-class pull-remote-order-by (logical-operator)
((input "std::shared_ptr<LogicalOperator>" :scope :public
:slk-save #'slk-save-operator-pointer
:slk-load #'slk-load-operator-pointer)
(plan-id :int64_t :initval 0 :scope :public)
(symbols "std::vector<Symbol>" :scope :public)
(order-by "std::vector<Expression *>" :scope :public
:slk-save #'slk-save-ast-vector
:slk-load (slk-load-ast-vector "Expression"))
(compare "TypedValueVectorCompare" :scope :public))
"Operator that merges distributed OrderBy operators.
Instead of using a regular OrderBy on master (which would collect all remote
results and order them), we can have each worker do an OrderBy locally and
have the master rely on the fact that the results are ordered and merge them
by having only one result from each worker.")
PullRemoteOrderBy() {}
const std::shared_ptr<LogicalOperator> &input, int64_t plan_id,
const std::vector<SortItem> &order_by, const std::vector<Symbol> &symbols);
bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override;
UniqueCursorPtr MakeCursor(utils::MemoryResource *) const override;
std::vector<Symbol> ModifiedSymbols(const SymbolTable &) const override;
std::vector<Symbol> OutputSymbols(const SymbolTable &) const override;
bool HasSingleInput() const override { return true; }
std::shared_ptr<LogicalOperator> input() const override { return input_; }
void set_input(std::shared_ptr<LogicalOperator> input) override {
input_ = input;
(:serialize (:slk))
(lcp:define-class distributed-expand (logical-operator)
((input "std::shared_ptr<LogicalOperator>" :scope :public
:slk-save #'slk-save-operator-pointer
:slk-load #'slk-load-operator-pointer)
(input-symbol "Symbol" :scope :public)
(common "ExpandCommon" :scope :public)
(graph-view "GraphView" :scope :public
"State from which the input node should get expanded."))
(:documentation "Distributed version of Expand operator")
DistributedExpand() {}
DistributedExpand(const std::shared_ptr<LogicalOperator> &input,
Symbol input_symbol, Symbol node_symbol, Symbol edge_symbol,
EdgeAtom::Direction direction,
const std::vector<storage::EdgeType> &edge_types,
bool existing_node, GraphView graph_view);
DistributedExpand(const std::shared_ptr<LogicalOperator> &input,
Symbol input_symbol, const ExpandCommon &common);
bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override;
UniqueCursorPtr MakeCursor(utils::MemoryResource *) const override;
std::vector<Symbol> ModifiedSymbols(const SymbolTable &) const override;
bool HasSingleInput() const override { return true; }
std::shared_ptr<LogicalOperator> input() const override { return input_; }
void set_input(std::shared_ptr<LogicalOperator> input) override {
input_ = input;
(:serialize (:slk))
(lcp:define-class distributed-expand-bfs (logical-operator)
((input "std::shared_ptr<LogicalOperator>" :scope :public
:slk-save #'slk-save-operator-pointer
:slk-load #'slk-load-operator-pointer)
(input-symbol "Symbol" :scope :public)
(common "ExpandCommon" :scope :public)
(lower-bound "Expression *" :scope :public
:documentation "Optional lower bound, default is 1"
:slk-save #'slk-save-ast-pointer
:slk-load (slk-load-ast-pointer "Expression"))
(upper-bound "Expression *" :scope :public
:documentation "Optional upper bound, default is infinity"
:slk-save #'slk-save-ast-pointer
:slk-load (slk-load-ast-pointer "Expression"))
(filter-lambda "ExpansionLambda" :scope :public
:documentation "Filter that must be satisfied for expansion to succeed."
:slk-load (lambda (member)
slk::Load(&self->${member}, reader, &helper->ast_storage);
(:documentation "BFS expansion operator suited for distributed execution.")
DistributedExpandBfs() {}
DistributedExpandBfs(const std::shared_ptr<LogicalOperator> &input,
Symbol input_symbol, Symbol node_symbol,
Symbol edge_symbol, EdgeAtom::Direction direction,
const std::vector<storage::EdgeType> &edge_types,
bool existing_node, Expression *lower_bound,
Expression *upper_bound,
const ExpansionLambda &filter_lambda);
DistributedExpandBfs(const std::shared_ptr<LogicalOperator> &input,
Symbol input_symbol, const ExpandCommon &common,
Expression *lower_bound, Expression *upper_bound,
const ExpansionLambda &filter_lambda);
bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override;
UniqueCursorPtr MakeCursor(utils::MemoryResource *) const override;
std::vector<Symbol> ModifiedSymbols(const SymbolTable &) const override;
bool HasSingleInput() const override { return true; }
std::shared_ptr<LogicalOperator> input() const override { return input_; }
void set_input(std::shared_ptr<LogicalOperator> input) override {
input_ = input;
(:serialize (:slk))
(lcp:define-class distributed-create-node (logical-operator)
((input "std::shared_ptr<LogicalOperator>" :scope :public
:slk-save #'slk-save-operator-pointer
:slk-load #'slk-load-operator-pointer)
(node-info "NodeCreationInfo" :scope :public
:slk-save (lambda (m)
slk::Save(self.${m}, builder, helper);
:slk-load (lambda (m)
slk::Load(&self->${m}, reader, helper);
(on-random-worker :bool :initval "false" :scope :public))
(:documentation "Create nodes in distributed environment.")
DistributedCreateNode() {}
DistributedCreateNode(const std::shared_ptr<LogicalOperator> &input,
const NodeCreationInfo &node_info, bool on_random_worker);
bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override;
UniqueCursorPtr MakeCursor(utils::MemoryResource *) const override;
std::vector<Symbol> ModifiedSymbols(const SymbolTable &) const override;
bool HasSingleInput() const override { return true; }
std::shared_ptr<LogicalOperator> input() const override { return input_; }
void set_input(std::shared_ptr<LogicalOperator> input) override {
input_ = input;
(:serialize (:slk))
(lcp:define-class distributed-create-expand (logical-operator)
((node-info "NodeCreationInfo" :scope :public
:slk-save (lambda (m)
slk::Save(self.${m}, builder, helper);
:slk-load (lambda (m)
slk::Load(&self->${m}, reader, helper);
(edge-info "EdgeCreationInfo" :scope :public
:slk-save (lambda (m)
slk::Save(self.${m}, builder, helper);
:slk-load (lambda (m)
slk::Load(&self->${m}, reader, helper);
(input "std::shared_ptr<LogicalOperator>" :scope :public
:slk-save #'slk-save-operator-pointer
:slk-load #'slk-load-operator-pointer)
(input-symbol "Symbol" :scope :public)
(existing-node :bool :scope :public))
(:documentation "Distributed version of CreateExpand")
DistributedCreateExpand() {}
DistributedCreateExpand(const NodeCreationInfo &node_info,
const EdgeCreationInfo &edge_info,
const std::shared_ptr<LogicalOperator> &input,
Symbol input_symbol, bool existing_node);
bool Accept(HierarchicalLogicalOperatorVisitor &visitor) override;
UniqueCursorPtr MakeCursor(utils::MemoryResource *) const override;
std::vector<Symbol> ModifiedSymbols(const SymbolTable &) const override;
bool HasSingleInput() const override { return true; }
std::shared_ptr<LogicalOperator> input() const override { return input_; }
void set_input(std::shared_ptr<LogicalOperator> input) override {
input_ = input;
(:serialize (:slk))
(lcp:pop-namespace) ;; plan
(lcp:pop-namespace) ;; query

File diff suppressed because it is too large Load Diff

View File

@ -1,31 +0,0 @@
/// @file
#pragma once
#include <memory>
#include "query/frontend/semantic/symbol_table.hpp"
#include "query/plan/operator.hpp"
namespace query::plan {
/// Complete plan split into master/worker parts.
struct DistributedPlan {
int64_t master_plan_id;
/// Plan to be executed on the master server.
std::unique_ptr<LogicalOperator> master_plan;
/// Pairs of {plan_id, plan} for execution on each worker.
std::vector<std::pair<int64_t, std::shared_ptr<LogicalOperator>>>
/// Ast storage with newly added expressions.
AstStorage ast_storage;
/// Symbol table with newly added symbols.
SymbolTable symbol_table;
/// Creates a `DistributedPlan` from a regular plan.
DistributedPlan MakeDistributedPlan(
const AstStorage &ast_storage, const LogicalOperator &plan,
const SymbolTable &symbol_table, std::atomic<int64_t> &next_plan_id,
const std::vector<storage::Property> &properties_by_ix);
} // namespace query::plan

View File

@ -1,249 +0,0 @@
#include "query/distributed/plan/pretty_print.hpp"
namespace query::plan {
bool DistributedPlanPrinter::PreVisit(query::plan::DistributedExpand &op) {
WithPrintLn([&](auto &out) {
out << "* DistributedExpand (" << << ")"
<< (op.common_.direction == query::EdgeAtom::Direction::IN ? "<-" : "-")
<< "[" <<;
utils::PrintIterable(out, op.common_.edge_types, "|",
[this](auto &stream, const auto &edge_type) {
stream << ":" << dba_->EdgeTypeName(edge_type);
out << "]"
<< (op.common_.direction == query::EdgeAtom::Direction::OUT ? "->"
: "-")
<< "(" << << ")";
return true;
bool DistributedPlanPrinter::PreVisit(query::plan::DistributedExpandBfs &op) {
WithPrintLn([&](auto &out) {
out << "* DistributedExpandBfs (" << << ")"
<< (op.common_.direction == query::EdgeAtom::Direction::IN ? "<-" : "-")
<< "[" <<;
utils::PrintIterable(out, op.common_.edge_types, "|",
[this](auto &stream, const auto &edge_type) {
stream << ":" << dba_->EdgeTypeName(edge_type);
out << "]"
<< (op.common_.direction == query::EdgeAtom::Direction::OUT ? "->"
: "-")
<< "(" << << ")";
return true;
bool DistributedPlanPrinter::PreVisit(query::plan::PullRemote &op) {
WithPrintLn([&op](auto &out) {
out << "* PullRemote [" << op.plan_id_ << "] {";
utils::PrintIterable(out, op.symbols_, ", ",
[](auto &out, const auto &sym) { out <<; });
out << "}";
WithPrintLn([](auto &out) { out << "|\\"; });
WithPrintLn([](auto &out) { out << "* workers"; });
return true;
bool DistributedPlanPrinter::PreVisit(query::plan::PullRemoteOrderBy &op) {
WithPrintLn([&op](auto &out) {
out << "* PullRemoteOrderBy {";
utils::PrintIterable(out, op.symbols_, ", ",
[](auto &out, const auto &sym) { out <<; });
out << "}";
WithPrintLn([](auto &out) { out << "|\\"; });
WithPrintLn([](auto &out) { out << "* workers"; });
return true;
#define PRE_VISIT(TOp) \
bool DistributedPlanPrinter::PreVisit(TOp &) { \
WithPrintLn([](auto &out) { out << "* " << #TOp; }); \
return true; \
bool DistributedPlanPrinter::PreVisit(DistributedCreateExpand &op) {
WithPrintLn([&](auto &out) {
out << "* DistributedCreateExpand (" << << ")"
<< (op.edge_info_.direction == query::EdgeAtom::Direction::IN ? "<-"
: "-")
<< "[" << << ":"
<< dba_->EdgeTypeName(op.edge_info_.edge_type) << "]"
<< (op.edge_info_.direction == query::EdgeAtom::Direction::OUT ? "->"
: "-")
<< "(" << << ")";
return true;
#undef PRE_VISIT
bool DistributedPlanPrinter::PreVisit(query::plan::Synchronize &op) {
WithPrintLn([&op](auto &out) {
out << "* Synchronize";
if (op.advance_command_) out << " (ADV CMD)";
if (op.pull_remote_) Branch(*op.pull_remote_);
return false;
// Pretty-prints the plan rooted at plan_root by running a
// DistributedPlanPrinter, constructed from &dba and out, over it.
// The const_cast is needed because Accept() is called on a non-const operator
// (see the FIXME below).
// NOTE(review): the closing brace is not visible in this view.
void DistributedPrettyPrint(const database::GraphDbAccessor &dba,
const LogicalOperator *plan_root,
std::ostream *out) {
DistributedPlanPrinter printer(&dba, out);
// FIXME(mtomic): We should make visitors that take const argument.
const_cast<LogicalOperator *>(plan_root)->Accept(printer);
// Converts the plan rooted at plan_root to JSON: runs an
// impl::DistributedPlanToJsonVisitor (constructed from &dba) over the plan and
// returns the visitor's output(). plan_root is const_cast because Accept() is
// called on a non-const operator.
// NOTE(review): the closing brace is not visible in this view.
nlohmann::json DistributedPlanToJson(const database::GraphDbAccessor &dba,
const LogicalOperator *plan_root) {
impl::DistributedPlanToJsonVisitor visitor(&dba);
const_cast<LogicalOperator *>(plan_root)->Accept(visitor);
return visitor.output();
namespace impl {
// DistributedPlanToJsonVisitor implementation
// The JSON formatted plan is consumed (or will be) by Memgraph Lab, and
// therefore should not be changed before synchronizing with whoever is
// maintaining Memgraph Lab. Hopefully, one day integration tests will exist and
// there will be no need to be super careful.
using json = nlohmann::json;
// Builds the JSON object for a DistributedExpand operator (keys: "name",
// "input_symbol", "node_symbol", "edge_symbol", "edge_types", "direction",
// "existing_node", "input") and stores it in output_. Returns false.
// NOTE(review): "input" is taken from PopOutput(), but the statement that
// produces that output (presumably a visit of the input operator) and the
// closing brace are not visible in this view -- confirm against the original.
bool DistributedPlanToJsonVisitor::PreVisit(DistributedExpand &op) {
json self;
self["name"] = "DistributedExpand";
self["input_symbol"] = ToJson(op.input_symbol_);
self["node_symbol"] = ToJson(op.common_.node_symbol);
self["edge_symbol"] = ToJson(op.common_.edge_symbol);
self["edge_types"] = ToJson(op.common_.edge_types, *dba_);
self["direction"] = ToString(op.common_.direction);
self["existing_node"] = op.common_.existing_node;
self["input"] = PopOutput();
output_ = std::move(self);
return false;
// Builds the JSON object for a DistributedExpandBfs operator and stores it in
// output_. "lower_bound", "upper_bound" and "filter_lambda" are set to a
// default-constructed json value when the corresponding member is not set.
// Returns false.
// NOTE(review): "input" is taken from PopOutput(), but the statement that
// produces that output and the closing brace are not visible in this view --
// confirm against the original.
bool DistributedPlanToJsonVisitor::PreVisit(DistributedExpandBfs &op) {
json self;
self["name"] = "DistributedExpandBfs";
self["input_symbol"] = ToJson(op.input_symbol_);
self["node_symbol"] = ToJson(op.common_.node_symbol);
self["edge_symbol"] = ToJson(op.common_.edge_symbol);
self["edge_types"] = ToJson(op.common_.edge_types, *dba_);
self["direction"] = ToString(op.common_.direction);
self["lower_bound"] = op.lower_bound_ ? ToJson(op.lower_bound_) : json();
self["upper_bound"] = op.upper_bound_ ? ToJson(op.upper_bound_) : json();
self["existing_node"] = op.common_.existing_node;
self["filter_lambda"] = op.filter_lambda_.expression
? ToJson(op.filter_lambda_.expression)
: json();
self["input"] = PopOutput();
output_ = std::move(self);
return false;
// Builds the JSON object for a PullRemote operator (keys: "name", "symbols",
// "input") and stores it in output_. Returns false.
// NOTE(review): the statement that produces the value popped by PopOutput()
// and the closing brace are not visible in this view.
bool DistributedPlanToJsonVisitor::PreVisit(PullRemote &op) {
json self;
self["name"] = "PullRemote";
self["symbols"] = ToJson(op.symbols_);
self["input"] = PopOutput();
output_ = std::move(self);
return false;
// Builds the JSON object for a PullRemoteOrderBy operator (visible keys:
// "name", "symbols", "input") and stores it in output_. Returns false.
// NOTE(review): the loop fills a per-entry object with "ordering" and
// "expression", but the statement that attaches it to `self` and the closing
// braces are not visible in this view -- confirm against the original.
bool DistributedPlanToJsonVisitor::PreVisit(PullRemoteOrderBy &op) {
json self;
self["name"] = "PullRemoteOrderBy";
// NOTE(review): `i` is deduced as int and compared against size().
for (auto i = 0; i < op.order_by_.size(); ++i) {
// The local variable is named `json`, shadowing the type alias of the same
// name for the rest of its scope.
json json;
json["ordering"] = ToString(op.compare_.ordering_[i]);
json["expression"] = ToJson(op.order_by_[i]);
self["symbols"] = ToJson(op.symbols_);
self["input"] = PopOutput();
output_ = std::move(self);
return false;
// Builds the JSON object for a DistributedCreateNode operator (keys: "name",
// "node_info", "on_random_worker", "input") and stores it in output_.
// Returns false.
// NOTE(review): the statement that produces the value popped by PopOutput()
// and the closing brace are not visible in this view.
bool DistributedPlanToJsonVisitor::PreVisit(DistributedCreateNode &op) {
json self;
self["name"] = "DistributedCreateNode";
self["node_info"] = ToJson(op.node_info_, *dba_);
self["on_random_worker"] = op.on_random_worker_;
self["input"] = PopOutput();
output_ = std::move(self);
return false;
// Builds the JSON object for a DistributedCreateExpand operator (keys: "name",
// "input_symbol", "node_info", "edge_info", "existing_node", "input") and
// stores it in output_. Returns false.
// NOTE(review): the statement that produces the value popped by PopOutput()
// and the closing brace are not visible in this view.
bool DistributedPlanToJsonVisitor::PreVisit(DistributedCreateExpand &op) {
json self;
self["name"] = "DistributedCreateExpand";
self["input_symbol"] = ToJson(op.input_symbol_);
self["node_info"] = ToJson(op.node_info_, *dba_);
self["edge_info"] = ToJson(op.edge_info_, *dba_);
self["existing_node"] = op.existing_node_;
self["input"] = PopOutput();
output_ = std::move(self);
return false;
// Builds the JSON object for a Synchronize operator (keys: "name",
// "advance_command", "input", "pull_remote") and stores it in output_.
// "pull_remote" is a default-constructed json value when the operator has no
// pull_remote_ child. Returns false.
// NOTE(review): the statements that produce the values popped by PopOutput()
// (for both "input" and "pull_remote") and the closing braces are not visible
// in this view -- confirm against the original.
bool DistributedPlanToJsonVisitor::PreVisit(Synchronize &op) {
json self;
self["name"] = "Synchronize";
self["advance_command"] = op.advance_command_;
self["input"] = PopOutput();
if (op.pull_remote_) {
self["pull_remote"] = PopOutput();
} else {
self["pull_remote"] = json();
output_ = std::move(self);
return false;
} // namespace impl
} // namespace query::plan

View File

@ -1,71 +0,0 @@
/// @file
#pragma once
#include "query/distributed/plan/ops.hpp"
#include "query/plan/pretty_print.hpp"
#include <json/json.hpp>
namespace query::plan {
/// Pretty-prints the plan rooted at `plan_root` to the stream `out`.
/// @param dba accessor passed on to the printer.
/// @param plan_root root operator of the plan to print.
/// @param out destination stream.
void DistributedPrettyPrint(const database::GraphDbAccessor &dba,
const LogicalOperator *plan_root,
std::ostream *out);
/// Convenience overload that pretty-prints the plan to std::cout.
/// NOTE(review): the closing brace is not visible in this view.
inline void DistributedPrettyPrint(const database::GraphDbAccessor &dba,
const LogicalOperator *plan_root) {
DistributedPrettyPrint(dba, plan_root, &std::cout);
/// Converts the plan rooted at `plan_root` to its JSON representation.
/// @param dba accessor passed on to the JSON visitor.
/// @param plan_root root operator of the plan to convert.
nlohmann::json DistributedPlanToJson(const database::GraphDbAccessor &dba,
const LogicalOperator *plan_root);
/// Plan printer that derives from both PlanPrinter and
/// DistributedOperatorVisitor and overrides PreVisit for the distributed
/// operators listed below.
/// NOTE(review): the access specifier and the closing brace of the class are
/// not visible in this view -- confirm against the original file.
class DistributedPlanPrinter : public PlanPrinter,
public DistributedOperatorVisitor {
// Bring the visit overloads of both bases into scope so that the overrides
// declared below do not hide them.
using DistributedOperatorVisitor::PostVisit;
using DistributedOperatorVisitor::PreVisit;
using DistributedOperatorVisitor::Visit;
// Inherit PlanPrinter's constructors.
using PlanPrinter::PlanPrinter;
using PlanPrinter::PostVisit;
using PlanPrinter::PreVisit;
using PlanPrinter::Visit;
bool PreVisit(DistributedExpand &) override;
bool PreVisit(DistributedExpandBfs &) override;
bool PreVisit(PullRemote &) override;
bool PreVisit(PullRemoteOrderBy &) override;
bool PreVisit(DistributedCreateNode &) override;
bool PreVisit(DistributedCreateExpand &) override;
bool PreVisit(Synchronize &) override;
namespace impl {
/// JSON plan visitor that derives from both PlanToJsonVisitor and
/// DistributedOperatorVisitor and overrides PreVisit for the distributed
/// operators listed below.
/// NOTE(review): the access specifier and the closing brace of the class are
/// not visible in this view -- confirm against the original file.
class DistributedPlanToJsonVisitor : public PlanToJsonVisitor,
public DistributedOperatorVisitor {
// Bring the visit overloads of both bases into scope so that the overrides
// declared below do not hide them.
using DistributedOperatorVisitor::PostVisit;
using DistributedOperatorVisitor::PreVisit;
using DistributedOperatorVisitor::Visit;
// Inherit PlanToJsonVisitor's constructors.
using PlanToJsonVisitor::PlanToJsonVisitor;
using PlanToJsonVisitor::PostVisit;
using PlanToJsonVisitor::PreVisit;
using PlanToJsonVisitor::Visit;
bool PreVisit(DistributedExpand &) override;
bool PreVisit(DistributedExpandBfs &) override;
bool PreVisit(PullRemote &) override;
bool PreVisit(PullRemoteOrderBy &) override;
bool PreVisit(DistributedCreateNode &) override;
bool PreVisit(DistributedCreateExpand &) override;
bool PreVisit(Synchronize &) override;
} // namespace impl
} // namespace query::plan

View File

@ -1,224 +0,0 @@
#include "query/distributed/serialization.hpp"
#include "distributed/data_manager.hpp"
#include "query/distributed/frontend/ast/ast_serialization.hpp"
namespace slk {
// Serializes a TypedValue as a one-byte type tag followed by its payload.
// Tags: 0 Null (no payload), 1 Bool, 2 Int, 3 Double, 4 String, 5 List,
// 6 Map, 7 Vertex, 8 Edge, 9 Path. `versions` and `worker_id` are forwarded
// unchanged to the nested Save calls for lists, maps, vertices, edges and
// paths.
// NOTE(review): no `break`/`return` statements or closing braces are visible
// between the cases in this view; as shown, every case would fall through to
// the next one -- confirm against the original file.
void Save(const query::TypedValue &value, slk::Builder *builder,
storage::SendVersions versions, int16_t worker_id) {
switch (value.type()) {
case query::TypedValue::Type::Null:
slk::Save(static_cast<uint8_t>(0), builder);
case query::TypedValue::Type::Bool:
slk::Save(static_cast<uint8_t>(1), builder);
slk::Save(value.Value<bool>(), builder);
case query::TypedValue::Type::Int:
slk::Save(static_cast<uint8_t>(2), builder);
slk::Save(value.Value<int64_t>(), builder);
case query::TypedValue::Type::Double:
slk::Save(static_cast<uint8_t>(3), builder);
slk::Save(value.Value<double>(), builder);
case query::TypedValue::Type::String:
slk::Save(static_cast<uint8_t>(4), builder);
slk::Save(std::string(value.ValueString()), builder);
case query::TypedValue::Type::List: {
// List payload: element count, then each element.
slk::Save(static_cast<uint8_t>(5), builder);
const auto &values = value.ValueList();
size_t size = values.size();
slk::Save(size, builder);
for (const auto &v : values) {
slk::Save(v, builder, versions, worker_id);
case query::TypedValue::Type::Map: {
// Map payload: entry count, then (key as std::string, value) per entry.
slk::Save(static_cast<uint8_t>(6), builder);
const auto &map = value.ValueMap();
size_t size = map.size();
slk::Save(size, builder);
for (const auto &kv : map) {
slk::Save(std::string(kv.first), builder);
slk::Save(kv.second, builder, versions, worker_id);
case query::TypedValue::Type::Vertex: {
slk::Save(static_cast<uint8_t>(7), builder);
slk::Save(value.ValueVertex(), builder, versions, worker_id);
case query::TypedValue::Type::Edge: {
slk::Save(static_cast<uint8_t>(8), builder);
slk::Save(value.ValueEdge(), builder, versions, worker_id);
case query::TypedValue::Type::Path: {
// Path payload: vertex count and vertices, then edge count and edges.
slk::Save(static_cast<uint8_t>(9), builder);
const auto &path = value.ValuePath();
size_t v_size = path.vertices().size();
slk::Save(v_size, builder);
for (const auto &v : path.vertices()) {
slk::Save(v, builder, versions, worker_id);
size_t e_size = path.edges().size();
slk::Save(e_size, builder);
for (const auto &e : path.edges()) {
slk::Save(e, builder, versions, worker_id);
// Deserializes a TypedValue into *value. Reads a one-byte type tag and then
// the payload for that tag: 0 default-constructed TypedValue, 1 bool,
// 2 int64_t, 3 double, 4 std::string, 5 list, 6 map, 7 vertex, 8 edge, 9 path.
// `dba` and `data_manager` are forwarded to the nested loads and to
// LoadVertexAccessor / LoadEdgeAccessor.
// Throws slk::SlkDecodeException ("Trying to load unknown TypedValue!").
// NOTE(review): no `return`/`break` statements, `default:` label or closing
// braces are visible in this view, so the exact control flow between the cases
// and the throw cannot be confirmed from here -- check the original file.
void Load(query::TypedValue *value, slk::Reader *reader,
database::GraphDbAccessor *dba,
distributed::DataManager *data_manager) {
uint8_t type;
slk::Load(&type, reader);
switch (type) {
case static_cast<uint8_t>(0):
*value = query::TypedValue();
case static_cast<uint8_t>(1): {
bool v;
slk::Load(&v, reader);
*value = v;
case static_cast<uint8_t>(2): {
int64_t v;
slk::Load(&v, reader);
*value = v;
case static_cast<uint8_t>(3): {
double v;
slk::Load(&v, reader);
*value = v;
case static_cast<uint8_t>(4): {
std::string v;
slk::Load(&v, reader);
*value = std::move(v);
case static_cast<uint8_t>(5): {
// List: element count, then each element loaded in place.
size_t size;
slk::Load(&size, reader);
// NOTE(review): `list` is indexed below, but no statement sizing it is
// visible in this view -- confirm that the original resizes it first.
std::vector<query::TypedValue> list;
for (size_t i = 0; i < size; ++i) {
slk::Load(&list[i], reader, dba, data_manager);
*value = std::move(list);
case static_cast<uint8_t>(6): {
// Map: entry count, then a string key followed by the value per entry.
size_t size;
slk::Load(&size, reader);
std::map<std::string, query::TypedValue> map;
for (size_t i = 0; i < size; ++i) {
std::string key;
slk::Load(&key, reader);
slk::Load(&map[key], reader, dba, data_manager);
*value = std::move(map);
case static_cast<uint8_t>(7):
*value = slk::LoadVertexAccessor(reader, dba, data_manager);
case static_cast<uint8_t>(8):
*value = slk::LoadEdgeAccessor(reader, dba, data_manager);
case static_cast<uint8_t>(9): {
// Path: vertex count and vertices, then edge count and edges. Both vectors
// use the memory resource obtained from *value.
size_t v_size;
slk::Load(&v_size, reader);
auto *memory = value->GetMemoryResource();
// NOTE(review): the constructor argument list of `vertices` is cut off in
// this view.
std::vector<VertexAccessor, utils::Allocator<VertexAccessor>> vertices(
for (size_t i = 0; i < v_size; ++i) {
vertices.push_back(slk::LoadVertexAccessor(reader, dba, data_manager));
size_t e_size;
slk::Load(&e_size, reader);
std::vector<EdgeAccessor, utils::Allocator<EdgeAccessor>> edges(memory);
for (size_t i = 0; i < e_size; ++i) {
edges.push_back(slk::LoadEdgeAccessor(reader, dba, data_manager));
// The path is seeded with vertices[0], so this relies on at least one vertex
// having been loaded; the full vertex and edge lists are then moved in.
query::Path path(vertices[0], memory);
path.vertices() = std::move(vertices);
path.edges() = std::move(edges);
*value = std::move(path);
throw slk::SlkDecodeException("Trying to load unknown TypedValue!");
// Serializes query parameters: the number of entries, then each entry in
// iteration order.
// NOTE(review): the closing braces are not visible in this view.
void Save(const query::Parameters &parameters, slk::Builder *builder) {
slk::Save(parameters.size(), builder);
for (auto &entry : parameters) {
slk::Save(entry, builder);
// Deserializes query parameters: reads the entry count, then that many
// (int, PropertyValue) pairs, passing each pair's first and second member to
// parameters->Add.
// NOTE(review): the closing braces are not visible in this view.
void Load(query::Parameters *parameters, slk::Reader *reader) {
size_t size = 0;
slk::Load(&size, reader);
for (size_t i = 0; i < size; ++i) {
std::pair<int, PropertyValue> entry;
slk::Load(&entry, reader);
parameters->Add(entry.first, entry.second);
// Serializes a TypedValueVectorCompare by saving its ordering_ member.
// NOTE(review): the closing brace is not visible in this view.
void Save(const query::TypedValueVectorCompare &comparator,
slk::Builder *builder) {
slk::Save(comparator.ordering_, builder);
// Deserializes a TypedValueVectorCompare by loading its ordering_ member.
// NOTE(review): the closing brace is not visible in this view.
void Load(query::TypedValueVectorCompare *comparator, slk::Reader *reader) {
slk::Load(&comparator->ordering_, reader);
// Serializes a GraphView as a single uint8_t: OLD is assigned 0 and NEW is
// assigned 1.
// NOTE(review): no `break` statements or closing braces are visible in this
// view; as shown, the OLD case would fall through and also end up as 1 --
// confirm against the original file.
void Save(const query::GraphView &graph_view, slk::Builder *builder) {
uint8_t enum_value = 0;
switch (graph_view) {
case query::GraphView::OLD:
enum_value = 0;
case query::GraphView::NEW:
enum_value = 1;
slk::Save(enum_value, builder);
// Deserializes a GraphView from a single uint8_t: 0 selects OLD and 1 selects
// NEW. Throws slk::SlkDecodeException ("Trying to load unknown enum value!").
// NOTE(review): no `break` statements, `default:` label or closing braces are
// visible in this view, so the exact control flow leading to the throw cannot
// be confirmed from here -- check the original file.
void Load(query::GraphView *graph_view, slk::Reader *reader) {
uint8_t enum_value;
slk::Load(&enum_value, reader);
switch (enum_value) {
case static_cast<uint8_t>(0):
*graph_view = query::GraphView::OLD;
case static_cast<uint8_t>(1):
*graph_view = query::GraphView::NEW;
throw slk::SlkDecodeException("Trying to load unknown enum value!");
} // namespace slk

View File

@ -1,45 +0,0 @@
#pragma once
#include "query/common.hpp"
#include "query/context.hpp"
#include "query/distributed/frontend/semantic/symbol_serialization.hpp"
#include "query/frontend/semantic/symbol_table.hpp"
#include "query/typed_value.hpp"
#include "storage/distributed/rpc/serialization.hpp"
namespace distributed {
class DataManager;
namespace slk {
/// Serializes a SymbolTable by saving its table_ member.
/// NOTE(review): the closing brace is not visible in this view.
inline void Save(const query::SymbolTable &symbol_table,
slk::Builder *builder) {
slk::Save(symbol_table.table_, builder);
/// Deserializes a SymbolTable by loading its table_ member.
/// NOTE(review): the closing brace is not visible in this view.
inline void Load(query::SymbolTable *symbol_table, slk::Reader *reader) {
slk::Load(&symbol_table->table_, reader);
/// Serializes `value` into `builder`. `versions` and `worker_id` are extra
/// arguments of the TypedValue overload only.
/// NOTE(review): presumably they control how vertex/edge data is sent to the
/// given worker -- confirm against the storage serialization code.
void Save(const query::TypedValue &value, slk::Builder *builder,
storage::SendVersions versions, int16_t worker_id);
/// Deserializes a TypedValue from `reader` into `value`, using `dba` and
/// `data_manager`.
void Load(query::TypedValue *value, slk::Reader *reader,
database::GraphDbAccessor *dba,
distributed::DataManager *data_manager);
/// Serialization of query::GraphView.
void Save(const query::GraphView &graph_view, slk::Builder *builder);
void Load(query::GraphView *graph_view, slk::Reader *reader);
/// Serialization of query::TypedValueVectorCompare.
void Save(const query::TypedValueVectorCompare &comparator,
slk::Builder *builder);
void Load(query::TypedValueVectorCompare *comparator, slk::Reader *reader);
/// Serialization of query::Parameters.
void Save(const query::Parameters &parameters, slk::Builder *builder);
void Load(query::Parameters *parameters, slk::Reader *reader);
} // namespace slk

View File

@ -684,7 +684,6 @@ TypedValue Assert(TypedValue *args, int64_t nargs, const EvaluationContext &ctx,
return TypedValue(args[0], ctx.memory);
#if defined(MG_SINGLE_NODE) || defined(MG_SINGLE_NODE_HA)
TypedValue Counter(TypedValue *args, int64_t nargs,
const EvaluationContext &context,
database::GraphDbAccessor *) {
@ -716,28 +715,6 @@ TypedValue Counter(TypedValue *args, int64_t nargs,
return TypedValue(value, context.memory);
// Implements the 'workerId' query function. Requires exactly one argument,
// which must be a vertex or an edge, and returns the worker_id() of that
// element's GlobalAddress(). Throws QueryRuntimeException when the argument
// count is not 1 or ("'workerId' argument must be a node or an edge.") for
// other argument types.
// NOTE(review): the trailing arguments of the TypedValue constructors, the
// `default:` label and the closing braces are not visible in this view --
// confirm against the original file.
TypedValue WorkerId(TypedValue *args, int64_t nargs,
const EvaluationContext &ctx, database::GraphDbAccessor *) {
if (nargs != 1) {
throw QueryRuntimeException("'workerId' requires exactly one argument.");
const auto &arg = args[0];
switch (arg.type()) {
case TypedValue::Type::Vertex:
return TypedValue(arg.ValueVertex().GlobalAddress().worker_id(),
case TypedValue::Type::Edge:
return TypedValue(arg.ValueEdge().GlobalAddress().worker_id(),
throw QueryRuntimeException(
"'workerId' argument must be a node or an edge.");
TypedValue Id(TypedValue *args, int64_t nargs, const EvaluationContext &ctx,
database::GraphDbAccessor *dba) {
@ -1068,12 +1045,7 @@ NameToFunction(const std::string &function_name) {
// Memgraph specific functions
if (function_name == "ASSERT") return Assert;
#if defined(MG_SINGLE_NODE) || defined(MG_SINGLE_NODE_HA)
if (function_name == "COUNTER") return Counter;
if (function_name == "WORKERID") return WorkerId;
return nullptr;

Some files were not shown because too many files have changed in this diff Show More