From d62feb56fc5a6174c10cb1105a5f9409b5214393 Mon Sep 17 00:00:00 2001
From: Marin Tomic <marin.tomic@memgraph.io>
Date: Wed, 25 Oct 2017 18:02:44 +0200
Subject: [PATCH] Some tests for the Elliott Management use case.

Reviewers: buda, teon.banek

Reviewed By: teon.banek

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D924
---
 customers/elliott/README.md    |  70 ++++++++++++++++++
 customers/elliott/commands.txt |  12 ++++
 customers/elliott/generate_dag | 125 +++++++++++++++++++++++++++++++++
 customers/elliott/graph1.json  |  39 ++++++++++
 customers/elliott/graph2.json  |  39 ++++++++++
 customers/elliott/graph3.json  |  39 ++++++++++
 6 files changed, 324 insertions(+)
 create mode 100644 customers/elliott/README.md
 create mode 100644 customers/elliott/commands.txt
 create mode 100755 customers/elliott/generate_dag
 create mode 100644 customers/elliott/graph1.json
 create mode 100644 customers/elliott/graph2.json
 create mode 100644 customers/elliott/graph3.json

diff --git a/customers/elliott/README.md b/customers/elliott/README.md
new file mode 100644
index 000000000..cbddc4864
--- /dev/null
+++ b/customers/elliott/README.md
@@ -0,0 +1,70 @@
+DISCLAIMER: this is just an initial test; the graph might not resemble
+the graph in the use case at all and the data might be completely
+irrelevant.
+
+We tried generating a few sample graphs from the vague description
+given in the use case doc. Then we tried writing queries that would
+solve the problem of updating nodes when a leaf value changes,
+assuming all the internal nodes compute only the sum function.
+
+We start by creating an index on `id` property to improve initial lookup
+performance:
+
+    CREATE INDEX ON :Leaf(id)
+
+Set the values of all leaves to 1:
+
+    MATCH (u:Leaf) SET u.value = 1
+
+Now we initialize the values of all other nodes in the graph:
+
+    MATCH (u) WHERE NOT u:Leaf SET u.value = 0
+
+    MATCH (u) WITH u
+    ORDER BY u.topological_index DESC
+    MATCH (u)-->(v) SET u.value = u.value + v.value
+
+Change the value of a leaf:
+
+    MATCH (u:Leaf {id: "18"}) SET u.value = 10
+
+We have to reset all the updated nodes to a neutral element:
+
+    MATCH (u:Leaf {id: "18"})<-[* bfs]-(v)
+    WHERE NOT v:Leaf SET v.value = 0
+
+Finally, we recalculate their values in topological order:
+
+    MATCH (u:Leaf {id: "18"})<-[* bfs]-(v)
+    WITH v ORDER BY v.topological_index DESC
+    MATCH (v)-->(w) SET v.value = v.value + w.value
+
+There are a few assumptions made worth pointing out.
+
+* We are able to efficiently maintain topological order
+  of vertices in the graph.
+
+* It is possible to accumulate the value of the function. Formally:
+  $$f(x_1, x_2, ..., x_n) = g(...(g(g(x_1, x_2), x_3), ...), x_n).$$
+
+* There is a neutral element for the operation. However, this
+  assumption can be dropped by introducing an artificial neutral element.
+
+The number of operations required is proportional to the sum of the degrees
+of the affected nodes.
+
+We generated graphs with $10^5$ nodes ($20\ 000$ nodes in each layer), varied the
+degree distribution in node layers and measured the time for the queries to execute:
+
+| # |  Root-Category-Group degree | Group-CustomGroup-Leaf degree |   Time    |
+|:-:|:---------------------------:|:-----------------------------:|:---------:|
+| 1 |            [1, 10]          |           [20, 40]            |   ~1.1s   |
+| 2 |            [1, 10]          |          [50, 100]            |   ~2.5s   |
+| 3 |           [10, 50]          |          [50, 100]            |   ~3.3s   |
+
+Due to the structure of the graph, an update of a leaf required updating almost
+all the nodes in the graph, so we don't show the times required for the initial
+graph update and for the update after a leaf change separately.
+
+However, there is not enough info on the use case to make the test more
+sophisticated.
diff --git a/customers/elliott/commands.txt b/customers/elliott/commands.txt
new file mode 100644
index 000000000..cd219ac31
--- /dev/null
+++ b/customers/elliott/commands.txt
@@ -0,0 +1,12 @@
+CREATE INDEX ON :Leaf(id);
+MATCH (u:Leaf) SET u.value = 1;
+MATCH (u) WHERE NOT u:Leaf SET u.value = 0;
+MATCH (u)
+WITH u ORDER BY u.topological_index DESC
+MATCH (u)-->(v) SET u.value = u.value + v.value;
+MATCH (u:Leaf {id: "85000"}) SET u.value = 10;
+MATCH (u:Leaf {id: "85000"})<-[* bfs]-(v)
+WHERE NOT v:Leaf SET v.value = 0;
+MATCH (u:Leaf {id: "85000"})<-[* bfs]-(v)
+WITH v ORDER BY v.topological_index DESC
+MATCH (v)-->(w) SET v.value = v.value + w.value;
diff --git a/customers/elliott/generate_dag b/customers/elliott/generate_dag
new file mode 100755
index 000000000..2cdbb6d67
--- /dev/null
+++ b/customers/elliott/generate_dag
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Generates a DAG from JSON spec in [config] and outputs nodes to
+[filename]_nodes, and edges to [filename]_edges in format convertible
+to Memgraph snapshot.
+
+Here's an example JSON spec:
+
+    {
+        "layers": [
+            {
+                "name": "A",
+                "sublayers": 1,
+                "degree_lo": 1,
+                "degree_hi": 3,
+                "nodes": 4
+            },
+            {
+                "name": "B",
+                "sublayers": 3,
+                "degree_lo": 2,
+                "degree_hi": 3,
+                "nodes": 10
+            },
+            {
+                "name": "C",
+                "sublayers": 1,
+                "degree_lo": 1,
+                "degree_hi": 1,
+                "nodes": 5
+            }
+        ]
+    }
+
+Nodes from each layer will be randomly divided into sublayers.  A node can
+only have edges pointing to nodes in lower sublayers of the same layer, or
+to nodes from the layer directly below it. Out-degree is chosen uniformly at
+random from the [degree_lo, degree_hi] interval."""
+
+import argparse
+from itertools import accumulate
+import json
+import random
+
+
+def _split_into_sum(n, k):
+    assert 1 <= n, "n should be at least 1"
+    assert k <= n, "k shouldn't be greater than n"
+    xs = [0] + sorted(random.sample(range(1, n), k-1)) + [n]
+    return [b - a for a, b in zip(xs, xs[1:])]
+
+
+def generate_dag(graph_config, seed=None):
+    random.seed(seed)
+
+    nodes = []
+    edges = []
+
+    layer_lo = 1
+    for layer in graph_config:
+        sublayers = _split_into_sum(layer['nodes'], layer['sublayers'])
+        sub_range = accumulate([layer_lo] + sublayers)
+        layer['sublayer_range'] = list(sub_range)
+        nodes.extend([
+            (u, layer['name'])
+            for u in range(layer_lo, layer_lo + layer['nodes'])
+        ])
+        layer_lo += layer['nodes']
+
+    edges = []
+
+    for layer, next_layer in zip(graph_config, graph_config[1:]):
+        degree_lo = layer['degree_lo']
+        degree_hi = layer['degree_hi']
+
+        sub_range = layer['sublayer_range']
+        sub_range_next = next_layer['sublayer_range']
+
+        layer_lo = sub_range[0]
+        next_layer_hi = sub_range_next[-1]
+
+        for sub_lo, sub_hi in zip(sub_range, sub_range[1:]):
+            for u in range(sub_lo, sub_hi):
+                num_edges = random.randint(degree_lo, degree_hi)
+                for _ in range(num_edges):
+                    v = random.randint(sub_hi, next_layer_hi - 1)
+                    edges.append((u, v))
+
+        for sub_lo, sub_hi in zip(sub_range_next, sub_range_next[1:]):
+            for u in range(sub_lo, sub_hi):
+                v = random.randint(layer_lo, sub_lo - 1)
+                edges.append((v, u))
+
+    return nodes, edges
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=__doc__)
+    parser.add_argument('config', type=str, help='graph config JSON file')
+    parser.add_argument('filename', type=str,
+                        help='nodes will be stored to filename_nodes, '
+                        'edges to filename_edges')
+    parser.add_argument('--seed', type=int,
+                        help='seed for the random generator (default = '
+                        'current system time)')
+    args = parser.parse_args()
+
+    with open(args.config, 'r') as f:
+        graph_config = json.loads(f.read())['layers']
+
+    nodes, edges = generate_dag(graph_config, seed=args.seed)
+
+    #  print nodes into CSV file
+    with open('{}_nodes'.format(args.filename), 'w') as out:
+        out.write('nodeId:ID(Node),name,topological_index:Int,:LABEL\n')
+        for node_id, layer in nodes:
+            out.write('{0},{1}{0},{0},{1}\n'.format(node_id, layer))
+
+    # print edges into CSV file
+    with open('{}_edges'.format(args.filename), 'w') as out:
+        out.write(':START_ID(Node),:END_ID(Node),:TYPE\n')
+        for u, v in edges:
+            out.write('{},{},child\n'.format(u, v))
diff --git a/customers/elliott/graph1.json b/customers/elliott/graph1.json
new file mode 100644
index 000000000..fbdc155ef
--- /dev/null
+++ b/customers/elliott/graph1.json
@@ -0,0 +1,39 @@
+{
+    "layers": [
+        { 
+            "name": "Root",
+            "sublayers": 1,
+            "degree_lo": 1,
+            "degree_hi": 10,
+            "nodes": 20000
+        },
+        { 
+            "name": "Category",
+            "sublayers": 5,
+            "degree_lo": 1,
+            "degree_hi": 10,
+            "nodes": 20000
+        },
+        {   
+            "name": "Group",
+            "sublayers": 1,
+            "degree_lo": 20,
+            "degree_hi": 40,
+            "nodes": 20000
+        },
+        {
+            "name": "CustomGroup",
+            "sublayers": 15,
+            "degree_lo": 20,
+            "degree_hi": 40,
+            "nodes": 20000
+        },
+        { 
+            "name": "Leaf",
+            "sublayers": 1,
+            "degree_lo": 1,
+            "degree_hi": 1,
+            "nodes": 20000
+        }
+    ]
+}
diff --git a/customers/elliott/graph2.json b/customers/elliott/graph2.json
new file mode 100644
index 000000000..6f67bdada
--- /dev/null
+++ b/customers/elliott/graph2.json
@@ -0,0 +1,39 @@
+{
+    "layers": [
+        { 
+            "name": "Root",
+            "sublayers": 1,
+            "degree_lo": 1,
+            "degree_hi": 10,
+            "nodes": 20000
+        },
+        { 
+            "name": "Category",
+            "sublayers": 5,
+            "degree_lo": 1,
+            "degree_hi": 10,
+            "nodes": 20000
+        },
+        {   
+            "name": "Group",
+            "sublayers": 1,
+            "degree_lo": 50,
+            "degree_hi": 100,
+            "nodes": 20000
+        },
+        {
+            "name": "CustomGroup",
+            "sublayers": 15,
+            "degree_lo": 50,
+            "degree_hi": 100,
+            "nodes": 20000
+        },
+        { 
+            "name": "Leaf",
+            "sublayers": 1,
+            "degree_lo": 50,
+            "degree_hi": 100,
+            "nodes": 20000
+        }
+    ]
+}
diff --git a/customers/elliott/graph3.json b/customers/elliott/graph3.json
new file mode 100644
index 000000000..2fe140b2f
--- /dev/null
+++ b/customers/elliott/graph3.json
@@ -0,0 +1,39 @@
+{
+    "layers": [
+        { 
+            "name": "Root",
+            "sublayers": 1,
+            "degree_lo": 10,
+            "degree_hi": 50,
+            "nodes": 20000
+        },
+        { 
+            "name": "Category",
+            "sublayers": 5,
+            "degree_lo": 10,
+            "degree_hi": 50,
+            "nodes": 20000
+        },
+        {   
+            "name": "Group",
+            "sublayers": 1,
+            "degree_lo": 50,
+            "degree_hi": 100,
+            "nodes": 20000
+        },
+        {
+            "name": "CustomGroup",
+            "sublayers": 15,
+            "degree_lo": 50,
+            "degree_hi": 100,
+            "nodes": 20000
+        },
+        { 
+            "name": "Leaf",
+            "sublayers": 1,
+            "degree_lo": 50,
+            "degree_hi": 100,
+            "nodes": 20000
+        }
+    ]
+}