Clean docs folder
Summary:
Removed folders:
  * dev/diagram
  * dev/distributed
  * feature_ref

Reviewers: teon.banek, mferencevic

Reviewed By: mferencevic

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D2710
This commit is contained in:
parent 000d6dba55
commit 32b35f6f88
@@ -1,288 +0,0 @@
// dot -Tpng dependencies.dot -o /path/to/output.png
// TODO (buda): Put PropertyValueStore to storage namespace
digraph {
  // At the beginning of each block there is a default style for that block.
  label="Memgraph Dependencies Diagram"; fontname="Roboto Bold"; fontcolor=black;
  fontsize=26; labelloc=top; labeljust=right;
  compound=true; // If true, allow edges between clusters
  rankdir=TB; // Alternatives: LR
  node [shape=record fontname="Roboto", fontsize=12, fontcolor=white];
  edge [color="#B5AFB7"];

  // -- Legend --
  // dir=both arrowtail=diamond arrowhead=vee -> group ownership
  // dir=both arrowtail=none, arrowhead=vee -> ownership; stack or uptr

  subgraph cluster_tcp_end_client_communication {
    label="TCP End Client Communication"; fontsize=14;
    node [style=filled, color="#DD2222" fillcolor="#DD2222"];

    // Owned elements
    "communication::Server";
    "io::network::Socket";

    // Intracluster connections
    "communication::Server" -> "io::network::Socket"
      [label="socket_" dir=both arrowtail=none arrowhead=vee];
  }

  subgraph cluster_bolt_server {
    label="Bolt Server"; fontsize=14;
    node [style=filled, color="#62A2CA" fillcolor="#62A2CA"];

    // Owned elements
    "communication::bolt::SessionData";
    "communication::bolt::Session";
    "communication::bolt::Encoder";
    "communication::bolt::Decoder";

    // Intracluster connections
    "communication::bolt::Session" -> "communication::bolt::Encoder"
      [label="encoder_", dir=both arrowtail=none, arrowhead=vee];
    "communication::bolt::Session" -> "communication::bolt::Decoder"
      [label="decoder_", dir=both arrowtail=none, arrowhead=vee];
  }

  subgraph cluster_opencypher_engine {
    label="openCypher Engine"; fontsize=14;
    node [style=filled, color="#68BDF6" fillcolor="#68BDF6"];

    // Owned Elements
    "query::Interpreter";
    "query::AstTreeStorage";
    "query::TypedValue";
    "query::Path";
    "query::Symbol";
    "query::Context";
    "query::ExpressionEvaluator";
    "query::Frame";
    "query::SymbolTable";
    "query::plan::LogicalOperator";
    "query::plan::Cursor";
    "query::plan::CostEstimator";

    // Intracluster connections
    "query::Interpreter" -> "query::AstTreeStorage"
      [label="ast_cache" dir=both arrowtail=diamond arrowhead=vee];
    "query::TypedValue" -> "query::Path";
    "query::plan::Cursor" -> "query::Frame";
    "query::plan::Cursor" -> "query::Context";
    "query::plan::LogicalOperator" -> "query::Symbol";
    "query::plan::LogicalOperator" -> "query::SymbolTable";
    "query::plan::LogicalOperator" -> "query::plan::Cursor";
  }

  subgraph cluster_storage {
    label="Storage" fontsize=14;
    node [style=filled, color="#FB6E00" fillcolor="#FB6E00"];

    // Owned Elements
    "database::GraphDb";
    "database::GraphDbAccessor";
    "storage::Record";
    "storage::Vertex";
    "storage::Edge";
    "storage::RecordAccessor";
    "storage::VertexAccessor";
    "storage::EdgeAccessor";
    "storage::Common";
    "storage::Label";
    "storage::EdgeType";
    "storage::Property";
    "storage::compression";
    "storage::SingleNodeConcurrentIdMapper";
    "storage::Location";
    "storage::StorageTypesLocation";
    "PropertyValueStore";
    "storage::RecordLock";
    "mvcc::Version";
    "mvcc::Record";
    "mvcc::VersionList";

    // Intracluster connections
    "storage::VertexAccessor" -> "storage::RecordAccessor"
      [arrowhead=onormal];
    "storage::EdgeAccessor" -> "storage::RecordAccessor"
      [arrowhead=onormal];
    "storage::RecordAccessor" -> "database::GraphDbAccessor"
      [style=dashed arrowhead=vee];
    "storage::Vertex" -> "mvcc::Record"
      [arrowhead=onormal];
    "storage::Edge" -> "mvcc::Record"
      [arrowhead=onormal];
    "storage::Edge" -> "PropertyValueStore"
      [arrowhead=vee];
    "storage::Vertex" -> "PropertyValueStore"
      [arrowhead=vee];
    "storage::Edge" -> "mvcc::VersionList"
      [label="from,to" arrowhead=vee style=dashed];
    "storage::VertexAccessor" -> "storage::Vertex"
      [arrowhead=vee];
    "storage::EdgeAccessor" -> "storage::Edge"
      [arrowhead=vee];
    "storage::SingleNodeConcurrentIdMapper" -> "storage::StorageTypesLocation"
      [arrowhead=vee];
    "storage::StorageTypesLocation" -> "storage::Location"
      [arrowhead=vee];
    "storage::Storage" -> "storage::StorageTypesLocation"
      [arrowhead=vee];
    "storage::Property" -> "storage::Common"
      [arrowhead=onormal];
    "storage::Label" -> "storage::Common"
      [arrowhead=onormal];
    "storage::EdgeType" -> "storage::Common"
      [arrowhead=onormal];
    "storage::Property" -> "storage::Location"
      [arrowhead=vee];
    "PropertyValueStore" -> "storage::Property"
      [arrowhead=vee];
    "PropertyValueStore" -> "storage::Location"
      [arrowhead=vee];
    "database::GraphDbAccessor" -> "database::GraphDb"
      [arrowhead=vee];
    "database::GraphDbAccessor" -> "tx::TransactionId"
      [arrowhead=vee];
    "mvcc::VersionList" -> "storage::RecordLock"
      [label="lock" arrowhead=vee];
    "mvcc::VersionList" -> "mvcc::Record"
      [label="head" arrowhead=vee];
    "mvcc::Record" -> "mvcc::Version"
      [arrowhead=onormal];

    // Explicit positioning
    {rank=same;
      "database::GraphDbAccessor";
      "storage::VertexAccessor";
      "storage::EdgeAccessor";}
    {rank=same;
      "storage::Common";
      "storage::compression";}
  }

  subgraph cluster_properties_on_disk {
    label="Properties on Disk" fontsize=14;
    node [style=filled, color="#102647" fillcolor="#102647"];

    // Owned Elements
    "storage::KVStore";
    "rocksdb";

    // Intracluster connections
    "storage::KVStore" -> "rocksdb";
  }

  subgraph cluster_distributed {
    label="Distributed" fontsize=14;
    node [style=filled, color="#FFC500" fillcolor="#FFC500"];

    // Owned Elements
    "distributed::DataManager";
    "distributed::DataRpcClients";

    // Intracluster connections
    "distributed::DataManager" -> "distributed::DataRpcClients"
      [arrowhead=vee];
    "storage::RecordAccessor" -> "distributed::DataManager"
      [style=dashed arrowhead=vee];
  }

  subgraph cluster_dynamic_partitioning {
    label="Dynamic Partitioning" fontsize=14;
    node [style=filled, color="#720096" fillcolor="#720096"];

    // Owned Elements
    "DynamicPartitioner";
  }

  subgraph cluster_security {
    label="Security" fontsize=14;
    node [style=filled, color="#857F87" fillcolor="#857F87"];

    // Owned Elements
    "Communication Encryption";
    "Data Encryption";
    "Access Control";
    "Audit Logging";
  }

  subgraph cluster_web_dashboard {
    label="Dashboard" fontsize=14;
    node [style=filled, color="#FF0092" fillcolor="#FF0092"];

    // Owned Elements
    "Memgraph Ops / Memgraph Cockpit";
  }

  subgraph cluster_rpc {
    label="RPC" fontsize=14;
    node [style=filled, color="#857F87" fillcolor="#857F87"];

    // Owned Elements
    "communication::rpc::Server";
    "communication::rpc::Client";
  }

  subgraph cluster_ingestion {
    label="Ingestion" fontsize=14;
    node [style=filled, color="#0B6D88" fillcolor="#0B6D88"];

    // Owned Elements
    "Extract";
    "Transform";
    "Load";
    "Amazon S3";
    "Kafka";

    // Intracluster connections
    "Extract" -> "Amazon S3";
    "Extract" -> "Kafka";

    // Explicit positioning
    {rank=same;"Extract";"Transform";"Load";}
  }

  // -- Intercluster connections --
  // cluster_tcp_end_client_communication -- cluster_bolt_server
  "communication::Server" -> "communication::bolt::SessionData" [color=black];
  "communication::Server" -> "communication::bolt::Session" [color=black];
  // cluster_bolt_server -> cluster_storage
  "communication::bolt::SessionData" -> "database::GraphDb" [color=red];
  "communication::bolt::Session" -> "database::GraphDbAccessor" [color=red];
  // cluster_bolt_server -> cluster_opencypher_engine
  "communication::bolt::SessionData" -> "query::Interpreter" [color=red];
  // cluster_opencypher_engine -- cluster_storage
  "query::Interpreter" -> "database::GraphDbAccessor" [color=black];
  "query::Interpreter" -> "storage::VertexAccessor" [color=black];
  "query::Interpreter" -> "storage::EdgeAccessor" [color=black];
  "query::TypedValue" -> "storage::VertexAccessor" [color=black];
  "query::TypedValue" -> "storage::EdgeAccessor" [color=black];
  "query::Path" -> "storage::VertexAccessor"
    [label="vertices" dir=both arrowtail=diamond arrowhead=vee color=black];
  "query::Path" -> "storage::EdgeAccessor"
    [label="edges" dir=both arrowtail=diamond arrowhead=vee color=black];
  "query::plan::LogicalOperator" -> "database::GraphDbAccessor"
    [color=black arrowhead=vee];
  // cluster_distributed -- cluster_storage
  "distributed::DataManager" -> "database::GraphDb"
    [arrowhead=vee style=dashed color=red];
  "distributed::DataManager" -> "tx::TransactionId"
    [label="ves_caches_key" dir=both arrowhead=none arrowtail=diamond
     color=red];
  "distributed::DataManager" -> "storage::Vertex"
    [label="vertices_caches" dir=both arrowhead=none arrowtail=diamond
     color=red];
  "distributed::DataManager" -> "storage::Edge"
    [label="edges_caches" dir=both arrowhead=none arrowtail=diamond
     color=red];
  // cluster_storage -- cluster_properties_on_disk
  "PropertyValueStore" -> "storage::KVStore"
    [label="static" arrowhead=vee color=black];
  // cluster_dynamic_partitioning -- cluster_storage
  "database::GraphDb" -> "DynamicPartitioner"
    [arrowhead=vee color=red];
  "DynamicPartitioner" -> "database::GraphDbAccessor"
    [arrowhead=vee color=black];
}
@@ -1,22 +0,0 @@
digraph {
  // label="Dynamic Graph Partitioning";
  fontname="Roboto Bold"; fontcolor=black;
  fontsize=26; labelloc=top; labeljust=center;
  compound=true; // If true, allow edges between clusters
  rankdir=TB; // Alternatives: LR
  node [shape=record fontname="Roboto", fontsize=12, fontcolor=white
        style=filled, color="#FB6E00" fillcolor="#FB6E00"];
  edge [color="#B5AFB7"];

  "distributed::DistributedGraphDb" -> "distributed::TokenSharingRpcServer";

  "distributed::TokenSharingRpcServer" -> "communication::rpc::Server";
  "distributed::TokenSharingRpcServer" -> "distributed::Coordination";
  "distributed::TokenSharingRpcServer" -> "distributed::TokenSharingRpcClients";
  "distributed::TokenSharingRpcServer" -> "distributed::dgp::Partitioner";

  "distributed::dgp::Partitioner" -> "distributed::DistributedGraphDb" [style=dashed];

  "distributed::dgp::Partitioner" -> "distributed::dgp::VertexMigrator";
  "distributed::dgp::VertexMigrator" -> "database::GraphDbAccessor" [style=dashed];
}
@@ -1,43 +0,0 @@
# Distributed addressing

In distributed Memgraph a single graph element must be owned by exactly
one worker. It is possible that multiple workers have cached copies of a
single graph element (which is inevitable), but there is only one owner.

The owner of a graph element can change. This is not yet implemented,
but it is intended: graph partitioning is meant to be dynamic.

Graph elements refer to other graph elements that possibly reside on
some other worker. Even though each graph element is identified by a
unique ID, that ID does not say where the element currently resides
(which worker is the owner).

We therefore introduce the concept of a global address. It indicates
both which graph element is referred to (its global ID) and where it
resides. Semantically it is a pair of two elements, but for efficiency
it is stored in 64 bits.

The global address is efficient for use in a cluster: it indicates where
something can be found. However, finding a graph element based on its ID
is still not a free operation (in the current implementation it is a
skiplist lookup). So, whenever possible, it is better to use local
addresses (pointers).

Succinctly, the requirements for addressing are:
- global addressing containing location info
- fast local addressing
- storage of both types in the same location efficiently
- translation between the two

The `storage::Address` class handles the enumerated storage
requirements. It stores either a local or a global address in the size
of a local pointer (typically 8 bytes).

Conversion between the two is done in multiple places. The general
approach is to use local addresses (when possible) only for local
in-memory handling. All communication and persistence uses global
addresses. Also, when receiving an address from another worker, localize
it as soon as possible, so that as little code as possible has to worry
about the potential inefficiency of using a global address for a local
graph element.
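To make the dual representation concrete, below is a minimal C++ sketch of a
64-bit address that holds either a local pointer or a (global id, worker id)
pair. The bit layout, field widths and names are illustrative assumptions, not
the actual `storage::Address` implementation.

```cpp
#include <cassert>
#include <cstdint>

// Hypothetical layout: bit 0 flags a global (remote) address; local
// pointers are at least 2-byte aligned, so their bit 0 is always zero.
template <typename TRecord>
class Address {
  static constexpr uint64_t kRemoteFlag = 1;

 public:
  // Local address: just the pointer value.
  explicit Address(TRecord *ptr) : storage_(reinterpret_cast<uint64_t>(ptr)) {
    assert((storage_ & kRemoteFlag) == 0 && "pointer must be aligned");
  }

  // Global address: global id + owning worker, packed above the flag bit.
  Address(uint32_t gid, uint16_t worker_id)
      : storage_((static_cast<uint64_t>(gid) << 17) |
                 (static_cast<uint64_t>(worker_id) << 1) | kRemoteFlag) {}

  bool is_local() const { return (storage_ & kRemoteFlag) == 0; }

  TRecord *local() const {
    assert(is_local());
    return reinterpret_cast<TRecord *>(storage_);
  }

  uint32_t gid() const {
    assert(!is_local());
    return static_cast<uint32_t>(storage_ >> 17);
  }

  uint16_t worker_id() const {
    assert(!is_local());
    return static_cast<uint16_t>((storage_ >> 1) & 0xFFFF);
  }

 private:
  uint64_t storage_;  // both forms fit in the size of one pointer
};
```

Packing both forms into a single 64-bit word keeps a stored edge endpoint as
small as a plain pointer, which is what the requirement of storing both
address types in the same location efficiently asks for.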
@@ -1,50 +0,0 @@
# Distributed durability

Durability in distributed Memgraph is slightly different than in
single-node because the state itself is shared between multiple workers
and none of those states are independent.

Note that recovering from persistent storage must result in a stable
database state. This means that across the cluster the state
modification of every transaction that was running is either recovered
fully or not at all. Also, if transaction A committed before transaction
B, then if B is recovered so must A.

## Snapshots

It is possibly avoidable but highly desirable that the database can be
recovered from a snapshot only, without relying on WAL files. For this
to be possible in distributed, it must be ensured that the same
transactions are recovered on all the workers (including the master) in
the cluster. Since the snapshot does not contain information about which
state change happened in which transaction, the only way to achieve this
is to have synchronized snapshots. This means that the process of
creating a snapshot, which is in itself transactional (it happens within
a transaction and thus observes some consistent database state), must
happen in the same transaction everywhere. This is achieved by the
master starting a snapshot-generating transaction and triggering the
process on all workers in the cluster.
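As a rough illustration of the coordination described above, the sketch below
shows the master broadcasting a single snapshot transaction id to every
worker. All names here are invented for the sake of the example and are not
the actual Memgraph RPC API.

```cpp
#include <cstdint>
#include <vector>

// Invented types for illustration only.
struct Transaction {
  uint64_t id;
};

struct WorkerSnapshotClient {
  // Asks one worker to write its snapshot within the given transaction,
  // so every worker observes the same consistent state.
  bool MakeSnapshot(uint64_t tx_id) { (void)tx_id; return true; }
};

// The master's own snapshot, written within the same transaction.
bool MakeLocalSnapshot(const Transaction &tx) { (void)tx; return true; }

bool MakeSynchronizedSnapshot(const Transaction &snapshot_tx,
                              std::vector<WorkerSnapshotClient> &workers) {
  // One transaction id is used everywhere, which makes the snapshots
  // recoverable as a consistent whole even without WAL files.
  bool ok = MakeLocalSnapshot(snapshot_tx);
  for (auto &worker : workers)
    ok = worker.MakeSnapshot(snapshot_tx.id) && ok;
  return ok;
}
```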
## WAL

Unlike the snapshot, write-ahead logs contain the information on which
transaction made which state change. This makes it possible to include
or exclude transactions during the recovery process. What is necessary,
however, is a global consensus on which of the transactions should be
recovered and which not, to ensure recovery into a consistent state.

It would be possible to achieve this with some kind of synchronized
recovery process, but it would impose constraints on cluster startup and
would not be trivial.

A simpler alternative is that the consensus is achieved beforehand,
while the database (to be recovered) is still operational. What is
necessary is to keep track of which transactions are guaranteed to have
been flushed to the WAL files on all the workers in the cluster. It
makes sense to keep this record on the master, so a mechanism is
introduced which periodically pings all the workers, telling them to
flush their WALs, and writes some sort of a log indicating that this has
been confirmed. The downside of this is that a periodic broadcast must
be done, and that potentially slightly less data can be recovered after
a crash than with a post-crash consensus. It is, however, much simpler
to implement.
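A minimal sketch of the ping-and-record mechanism might look as follows;
`WalFlushClient`, `DurabilityLog` and the loop structure are assumptions made
purely for illustration.

```cpp
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstdint>
#include <limits>
#include <thread>
#include <vector>

// Invented types for illustration only.
struct WalFlushClient {
  // Tells one worker to flush its WAL; returns the id of the last
  // transaction that is now guaranteed to be on disk there.
  uint64_t FlushWal() { return 0; }
};

struct DurabilityLog {
  // Records the transaction id up to which cluster-wide recovery from
  // WALs is guaranteed to be consistent.
  void RecordSafeTx(uint64_t tx_id) { safe_tx_id = tx_id; }
  uint64_t safe_tx_id = 0;
};

void PeriodicWalFlush(std::vector<WalFlushClient> &workers, DurabilityLog &log,
                      const std::atomic<bool> &running) {
  using namespace std::chrono_literals;
  while (running) {
    // The cluster-wide safe point is the minimum over all workers.
    uint64_t safe = std::numeric_limits<uint64_t>::max();
    for (auto &worker : workers) safe = std::min(safe, worker.FlushWal());
    if (!workers.empty()) log.RecordSafeTx(safe);
    std::this_thread::sleep_for(1s);  // the periodic broadcast
  }
}
```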
@@ -1,51 +0,0 @@
## Dynamic Graph Partitioning

Memgraph supports dynamic graph partitioning similar to the Spinner
algorithm described in this paper: [https://arxiv.org/pdf/1404.3861.pdf].

DGP is useful because it tries to group `local` data on the same worker,
i.e. it tries to keep closely connected data on one worker. It tries to
avoid jumps across workers when querying/traversing the distributed
graph.

### Our implementation

It works independently on each worker, but migration is always running
on only one worker at a time. It achieves that by sharing a token
between workers; token ownership is transferred to the next worker when
the current worker finishes its migration step.

The reason we want workers to work in disjoint time slots is to avoid
serialization errors caused by creating/removing edges of vertices
during migrations, which might cause an update of some vertex from two
or more different transactions.

### Migrations

For each vertex and worker id (label in the context of the DGP
algorithm) we define a score function. The score function takes into
account the labels of the endpoints of the vertex's edges (in/out) and
the capacity of the worker with said label. It loosely looks like this:
```
locality(v, l) =
  count of endpoints of edges of vertex `v` with label `l` / degree of `v`

capacity(l) =
  number of vertices on worker `l` divided by the worker capacity
  (usually equal to the average number of vertices per worker)

score(v, l) = locality(v, l) - capacity(l)
```
We also define two flags alongside `dynamic_graph_partitioner_enabled`:
`dgp_improvement_threshold` and `dgp_max_batch_size`.

These two flags are used during the migration phase. When deciding
whether to migrate some vertex `v` from worker `l1` to worker `l2` we
examine the difference in scores, i.e. if
score(v, l1) - dgp_improvement_threshold / 100 < score(v, l2), then we
migrate the vertex.

The max batch size flag limits the number of vertices we can transfer in
one batch (one migration step). Setting it too large will probably cause
a lot of interference with client queries, while setting it too small
will slow down convergence of the algorithm.
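To make the score function and the migration rule concrete, here is a small
illustrative C++ sketch; the data layout and names are assumptions, not the
actual DGP code.

```cpp
#include <unordered_map>

// For one vertex: per worker label, how many endpoints of its edges
// (in/out) live on that worker, plus its total degree.
struct VertexStats {
  std::unordered_map<int, double> endpoint_count;
  double degree = 0;
};

double Locality(const VertexStats &v, int label) {
  auto it = v.endpoint_count.find(label);
  if (it == v.endpoint_count.end() || v.degree == 0) return 0.0;
  return it->second / v.degree;
}

double Capacity(double vertices_on_worker, double worker_capacity) {
  // worker_capacity is usually the average number of vertices per worker.
  return vertices_on_worker / worker_capacity;
}

double Score(const VertexStats &v, int label, double vertices_on_worker,
             double worker_capacity) {
  return Locality(v, label) - Capacity(vertices_on_worker, worker_capacity);
}

// Migrate v from l1 to l2 if the improvement exceeds the threshold (in %).
bool ShouldMigrate(double score_l1, double score_l2,
                   double dgp_improvement_threshold) {
  return score_l1 - dgp_improvement_threshold / 100.0 < score_l2;
}
```

For example, with score(v, l1) = 0.40, score(v, l2) = 0.35 and
dgp_improvement_threshold = 10, the check is 0.40 - 0.10 = 0.30 < 0.35, so the
vertex would be migrated.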
@@ -1,54 +0,0 @@
# Memgraph distributed

This chapter describes some of the concepts used in distributed
Memgraph. By "distributed" here we mean the sharding of a single graph
onto multiple processing units (servers).

## Conceptual organization

There is a single master and multiple workers. The master contains all
the global sources of truth (transaction engine,
[label|edge-type|property] to name mappings). Also, in the current
organization it is the only one that contains a Bolt server (for
communication with the end client) and an interpretation engine. Workers
contain the data and the means of subquery interpretation (query plans
received from the master) and the means of communication with the master
and other workers.

In many query plans the load on the master is much larger than the load
on the workers. For that reason it might be beneficial to make the
master contain less data (or none at all), and/or to have multiple
interpretation masters.

## Logic organization

Both the distributed and the single-node Memgraph use the same codebase.
In cases where the behavior in single-node differs from that in
distributed, some kind of dynamic behavior change is implemented (either
through inheritance or conditional logic).

### GraphDb

The `database::GraphDb` is an "umbrella" object for parts of the
database such as storage, garbage collection, the transaction engine
etc. There is a class hierarchy of `GraphDb` implementations, as well as
a base interface object. There are subclasses for single-node, master
and worker deployments. Which implementation is used depends on the
configuration processed in the `main` entry point of Memgraph.

The `GraphDb` interface exposes getters to base classes of other similar
hierarchies (for example to `tx::Engine`). In that way much of the code
that uses those objects (for example query plan interpretation) is
agnostic to the type of deployment.
### RecordAccessors

The functionality of `RecordAccessors` and its subclasses is already
documented. It is important to note that the same implementation of
accessors is used in all deployments, with internal changes of behavior
depending on the locality of the graph element (vertex or edge) the
accessor represents. For example, if the graph element is local, an
update operation on an accessor will make the necessary MVCC ops, update
local data, indexes, the write-ahead log etc. However, if the accessor
represents a remote graph element, an update will trigger an RPC message
to the owner about the update and a change in the local cache.
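The following sketch illustrates that dispatch on locality; all types and
method names are simplified assumptions rather than the real accessor API.

```cpp
#include <cstdint>

// Invented, simplified types for illustration only.
struct PropertyValue {};

struct DataManager {
  // Sends a deferred-update RPC to the owner and is also the place where
  // the local cache of the remote element lives.
  void SendPropertyUpdate(uint64_t gid, int owner_worker, int key,
                          const PropertyValue &value) {}
};

class RecordAccessor {
 public:
  void SetProperty(int key, const PropertyValue &value) {
    if (is_local_) {
      // Local element: regular MVCC ops, local data, indexes, WAL, ...
      UpdateLocal(key, value);
    } else {
      // Remote element: RPC to the owner plus an update of the local cache.
      data_manager_->SendPropertyUpdate(gid_, owner_worker_, key, value);
      UpdateCache(key, value);
    }
  }

 private:
  void UpdateLocal(int key, const PropertyValue &value) {}
  void UpdateCache(int key, const PropertyValue &value) {}

  bool is_local_ = true;
  uint64_t gid_ = 0;
  int owner_worker_ = 0;
  DataManager *data_manager_ = nullptr;
};
```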
@@ -1,103 +0,0 @@
# Distributed updates

Operations that modify the graph state are somewhat more complex in the
distributed system than in a single-node Memgraph deployment. The
complexity arises from two factors.

First, the data being modified is not necessarily owned by the worker
performing the modification. This situation is completely valid: workers
execute parts of the query plan, and parts must be executed by the
master.

Second, there are fewer guarantees regarding multi-threaded access. In
single-node Memgraph it was guaranteed that only one thread will be
performing database work in a single transaction. This implied that
per-version storage could be thread-unsafe. In distributed Memgraph it
is possible that multiple threads could be performing work in the same
transaction as a consequence of the query being executed at the same
time on multiple workers and those executions interacting with the
globally partitioned database state.

## Deferred state modification

Making the per-version data storage thread-safe would most likely have a
performance impact that is very undesirable in a transactional database
intended for high throughput.

An alternative is that state modification over unsafe structures is not
performed immediately when requested, but postponed until it is safe to
do so (when there is a guarantee of no concurrent access).

Since local query plan execution is done the same way on local data as
it is in single-node Memgraph, it is not possible to defer that part of
the modification story. What can be deferred are modifications requested
by other workers. Since local query plan execution is still
single-threaded, this approach is safe.

At the same time, the workers requesting the remote update can update
local copies (caches) of the not-owned data, since that cache is only
being used by the single, local-execution thread.
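A minimal sketch of such deferral could look like this; the class and member
names are invented for illustration and do not correspond to the actual
Memgraph implementation.

```cpp
#include <cstdint>
#include <mutex>
#include <unordered_map>
#include <utility>
#include <vector>

// One requested change of some locally owned graph element.
struct Delta {
  uint64_t gid;      // which graph element to update
  int property_key;  // illustrative payload
  double value;
};

class DeferredUpdates {
 public:
  // Called when a remote worker requests an update of locally owned data;
  // nothing is applied yet, the request is only buffered per transaction.
  void Emplace(uint64_t tx_id, Delta delta) {
    std::lock_guard<std::mutex> guard(lock_);
    buffered_[tx_id].push_back(delta);
  }

  // Called at the global synchronization point, when no query plan part is
  // executing any more, so touching thread-unsafe storage is safe.
  void Apply(uint64_t tx_id) {
    std::vector<Delta> deltas;
    {
      std::lock_guard<std::mutex> guard(lock_);
      auto it = buffered_.find(tx_id);
      if (it == buffered_.end()) return;
      deltas = std::move(it->second);
      buffered_.erase(it);
    }
    for (const auto &delta : deltas) ApplyToStorage(delta);
  }

 private:
  // Single-threaded at this point: plain MVCC update of the owned record.
  void ApplyToStorage(const Delta &delta) { (void)delta; }

  std::mutex lock_;
  std::unordered_map<uint64_t, std::vector<Delta>> buffered_;
};
```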
### Visibility

Since updates are deferred, the question arises: when do the updates
become visible? The above described process offers the following
visibility guarantees:
- updates done on the local state are visible to the owner
- updates done on the local state are NOT visible to anyone else during
  the same (transaction + command)
- updates done on remote state are deferred on the owner and not visible
  to the owner until applied
- updates done on the remote state are applied immediately to the local
  caches and thus visible locally

This implies an inconsistent view of the database state. In a concurrent
execution of a single query this can hardly be avoided and is accepted
as such. It does not change the Cypher query execution semantics in any
of the well-defined scenarios. It possibly changes some of the behaviors
in which the semantics are not well defined even in single-node
execution.

### Synchronization, update application

In many queries it is mandatory to observe the latest global graph state
(typically when returning it to the client). That means that before that
happens all the deferred updates need to be applied, and all the caches
of remote data invalidated. Exactly this happens when executing queries
that modify the graph state: at some point a global synchronization
point is reached. First, all workers are waited on to finish executing
the query plan parts that perform state modifications. After that, all
the workers are told to apply the deferred updates they received to
their graph state. Since there is no concurrent query plan execution,
this is safe. Once that is done, all the local caches are cleared and
the requested data can be returned to the client.

### Command advancement

In complex queries where a read part follows a state modification part,
the synchronization process after the state modification part is
followed by command advancement, as in single-node execution.

## Creation

Graph element creation is not deferred. This is practical because the
response to a creation is the global ID of the newly created element. At
the same time it is safe because no other worker (including the owner)
will be using the newly added graph element.

## Updating

Updating is deferred, as described. Note that this also means that
record locking conflicts are deferred and serialization errors
(including lock timeouts) are postponed until the deferred update
application phase. In certain scenarios it might be beneficial to force
these errors to happen earlier, when the deferred update request is
processed.

## Deletion

Deletion is also deferred. Deleting an edge implies a modification of
its endpoint vertices, which must be deferred as those data structures
are not thread-safe. Deleting a vertex is done either with detaching, in
which case an arbitrary number of updates are implied in the vertex's
neighborhood, or without detaching, which relies on checking the current
state of the graph, which is generally impossible in distributed.
@@ -1,20 +0,0 @@
## Dynamic Graph Partitioner

Memgraph supports dynamic graph partitioning, which dynamically improves
performance on datasets that are badly partitioned over the workers. To
enable it, the user should use the following flag when firing up the
*master* node:

```plaintext
--dynamic_graph_partitioner_enable
```

### Parameters

| Name | Default Value | Description | Range |
|------|---------------|-------------|-------|
| --dgp_improvement_threshold | 10 | How much better the score of a specific node should be to consider a migration to another worker. This is the minimal difference between the new score the vertex would have when migrated and the old one for the migration to happen. | Min: 1, Max: 100 |
| --dgp_max_batch_size | 2000 | Maximal number of vertices that should be migrated in one dynamic graph partitioner step. | Min: 1, Max: MaxInt32 |
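For reference, a hypothetical start-up line for the master combining the flags
above with their default values could look like this; verify the executable
name and exact flag spellings against the installed version.

```plaintext
./memgraph --dynamic_graph_partitioner_enable \
           --dgp_improvement_threshold=10 \
           --dgp_max_batch_size=2000
```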