From d62feb56fc5a6174c10cb1105a5f9409b5214393 Mon Sep 17 00:00:00 2001 From: Marin Tomic <marin.tomic@memgraph.io> Date: Wed, 25 Oct 2017 18:02:44 +0200 Subject: [PATCH] Some tests for the Elliott Management use case. Reviewers: buda, teon.banek Reviewed By: teon.banek Subscribers: pullbot Differential Revision: https://phabricator.memgraph.io/D924 --- customers/elliott/README.md | 70 ++++++++++++++++++ customers/elliott/commands.txt | 12 ++++ customers/elliott/generate_dag | 125 +++++++++++++++++++++++++++++++++ customers/elliott/graph1.json | 39 ++++++++++ customers/elliott/graph2.json | 39 ++++++++++ customers/elliott/graph3.json | 39 ++++++++++ 6 files changed, 324 insertions(+) create mode 100644 customers/elliott/README.md create mode 100644 customers/elliott/commands.txt create mode 100755 customers/elliott/generate_dag create mode 100644 customers/elliott/graph1.json create mode 100644 customers/elliott/graph2.json create mode 100644 customers/elliott/graph3.json diff --git a/customers/elliott/README.md b/customers/elliott/README.md new file mode 100644 index 000000000..cbddc4864 --- /dev/null +++ b/customers/elliott/README.md @@ -0,0 +1,70 @@ +DISCLAIMER: this is just an initial test, graph might not resemble +the graph in the use case at all and the data might be completely +irrelevant. + +We tried generating a few sample graphs from the vague description +given in the use case doc. Then we tried writing queries that would +solve the problem of updating nodes when a leaf value changes, +assuming all the internal nodes compute only the sum function. 
+ +We start by creating an index on the `id` property to improve initial lookup +performance: + + CREATE INDEX ON :Leaf(id) + +Set values of all leaves to 1: + + MATCH (u:Leaf) SET u.value = 1 + +Now we initialize the values of all other nodes in the graph: + + MATCH (u) WHERE NOT u:Leaf SET u.value = 0 + + MATCH (u) WITH u + ORDER BY u.topological_index DESC + MATCH (u)-->(v) SET u.value = u.value + v.value + +Change the value of a leaf: + + MATCH (u:Leaf {id: "18"}) SET u.value = 10 + +We have to reset all the updated nodes to a neutral element: + + MATCH (u:Leaf {id: "18"})<-[* bfs]-(v) + WHERE NOT v:Leaf SET v.value = 0 + +Finally, we recalculate their values in topological order: + + MATCH (u:Leaf {id: "18"})<-[* bfs]-(v) + WITH v ORDER BY v.topological_index DESC + MATCH (v)-->(w) SET v.value = v.value + w.value + +There are a few assumptions made worth pointing out. + +* We are able to efficiently maintain topological order + of vertices in the graph. + +* It is possible to accumulate the value of the function. Formally: + $$f(x_1, x_2, ..., x_n) = g(...(g(g(x_1, x_2), x_3), ...), x_n).$$ + +* There is a neutral element for the operation. However, this + assumption can be dropped by introducing an artificial neutral element. + +The number of operations required is proportional to the sum of degrees of +affected nodes. + +We generated graphs with $10^5$ nodes ($20\ 000$ nodes in each layer), varied the +degree distribution in node layers and measured the time for the queries to execute: + +| # | Root-Category-Group degree | Group-CustomGroup-Leaf degree | Time | +|:-:|:---------------------------:|:-----------------------------:|:---------:| +| 1 | [1, 10] | [20, 40] | ~1.1s | +| 2 | [1, 10] | [50, 100] | ~2.5s | +| 3 | [10, 50] | [50, 100] | ~3.3s | + +Due to the structure of the graph, updating a leaf required updating almost +all the nodes in the graph, so we don't show the times required for the initial +graph update and the update after a leaf change separately. 
+ +However, there is not enough info on the use case to make the test more +sophisticated. diff --git a/customers/elliott/commands.txt b/customers/elliott/commands.txt new file mode 100644 index 000000000..cd219ac31 --- /dev/null +++ b/customers/elliott/commands.txt @@ -0,0 +1,12 @@ +CREATE INDEX ON :Leaf(id); +MATCH (u:Leaf) SET u.value = 1; +MATCH (u) WHERE NOT u:Leaf SET u.value = 0; +MATCH (u) WITH u +ORDER BY u.topological_index DESC +MATCH (u)-->(v) SET u.value = u.value + v.value; +MATCH (u:Leaf {id: "85000"}) SET u.value = 10; +MATCH (u:Leaf {id: "85000"})<-[* bfs]-(v) +WHERE NOT v:Leaf SET v.value = 0; +MATCH (u:Leaf {id: "85000"})<-[* bfs]-(v) +WITH v ORDER BY v.topological_index DESC +MATCH (v)-->(w) SET v.value = v.value + w.value; diff --git a/customers/elliott/generate_dag b/customers/elliott/generate_dag new file mode 100755 index 000000000..2cdbb6d67 --- /dev/null +++ b/customers/elliott/generate_dag @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Generates a DAG from JSON spec in [config] and outputs nodes to +[filename]_nodes, and edges to [filename]_edges in format convertible +to Memgraph snapshot. + +Here's an example JSON spec: + + { + "layers": [ + { + "name": "A", + "sublayers": 1, + "degree_lo": 1, + "degree_hi": 3, + "nodes": 4 + }, + { + "name": "B", + "sublayers": 3, + "degree_lo": 2, + "degree_hi": 3, + "nodes": 10 + }, + { + "name": "C", + "sublayers": 1, + "degree_lo": 1, + "degree_hi": 1, + "nodes": 5 + } + ] + } + +Nodes from each layer will be randomly divided into sublayers. A node can +only have edges pointing to nodes in lower sublayers of the same layer, or +to nodes from the layer directly below it. 
# Tail of the module docstring that opens before this chunk: out-degree is
# chosen uniformly at random from the [degree_lo, degree_hi] interval.

import argparse
from itertools import accumulate
import json
import random


def _split_into_sum(n, k):
    """Randomly split the integer n into k positive integers summing to n.

    k - 1 distinct cut points are sampled from [1, n - 1]; the gaps between
    consecutive cut points (with 0 and n added at the ends) are the parts,
    returned in order as a list of length k.
    """
    assert 1 <= n, "n should be at least 1"
    assert k <= n, "k shouldn't be greater than n"
    cuts = [0] + sorted(random.sample(range(1, n), k - 1)) + [n]
    return [hi - lo for lo, hi in zip(cuts, cuts[1:])]


def generate_dag(graph_config, seed=None):
    """Generate a random layered DAG described by graph_config.

    Args:
        graph_config: list of layer dicts, ordered from the top layer to the
            bottom one, each providing the keys 'name', 'nodes',
            'sublayers', 'degree_lo' and 'degree_hi'. The dicts are only
            read; nothing is written back into them.
        seed: value handed to random.seed(), which reseeds the module-level
            random generator.

    Returns:
        A (nodes, edges) tuple. nodes is a list of (node_id, layer_name)
        pairs, with ids numbered consecutively from 1 in layer order. edges
        is a list of (source_id, target_id) pairs that may contain
        duplicates; every edge points from a smaller id to a larger one, so
        the node id doubles as a topological index.
    """
    random.seed(seed)

    nodes = []
    # Sublayer boundaries of every layer, kept in a local list so that the
    # caller's config dicts are left untouched.
    sublayer_ranges = []

    layer_lo = 1
    for layer in graph_config:
        sizes = _split_into_sum(layer['nodes'], layer['sublayers'])
        # Cumulative sums give the first id of each sublayer, followed by
        # the first id past the end of the layer.
        sublayer_ranges.append(list(accumulate([layer_lo] + sizes)))
        layer_hi = layer_lo + layer['nodes']
        nodes.extend((u, layer['name']) for u in range(layer_lo, layer_hi))
        layer_lo = layer_hi

    edges = []
    for layer, sub_range, sub_range_next in zip(
            graph_config, sublayer_ranges, sublayer_ranges[1:]):
        degree_lo = layer['degree_lo']
        degree_hi = layer['degree_hi']

        layer_lo = sub_range[0]
        next_layer_hi = sub_range_next[-1]

        # Outgoing edges: each node targets ids after its own sublayer, up
        # to the last node of the next layer.
        for sub_lo, sub_hi in zip(sub_range, sub_range[1:]):
            for u in range(sub_lo, sub_hi):
                num_edges = random.randint(degree_lo, degree_hi)
                for _ in range(num_edges):
                    v = random.randint(sub_hi, next_layer_hi - 1)
                    edges.append((u, v))

        # One extra incoming edge for every node of the next layer, so none
        # of them is left without a parent.
        for sub_lo, sub_hi in zip(sub_range_next, sub_range_next[1:]):
            for u in range(sub_lo, sub_hi):
                v = random.randint(layer_lo, sub_lo - 1)
                edges.append((v, u))

    return nodes, edges


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument('config', type=str, help='graph config JSON file')
    parser.add_argument('filename', type=str,
                        help='nodes will be stored to filename_nodes, '
                        'edges to filename_edges')
    parser.add_argument('--seed', type=int,
                        help='seed for the random generator (default = '
                        'current system time)')
system time)') + args = parser.parse_args() + + with open(args.config, 'r') as f: + graph_config = json.loads(f.read())['layers'] + + nodes, edges = generate_dag(graph_config, seed=args.seed) + + # print nodes into CSV file + with open('{}_nodes'.format(args.filename), 'w') as out: + out.write('nodeId:ID(Node),name,topological_index:Int,:LABEL\n') + for node_id, layer in nodes: + out.write('{0},{1}{0},{0},{1}\n'.format(node_id, layer)) + + # print edges into CSV file + with open('{}_edges'.format(args.filename), 'w') as out: + out.write(':START_ID(Node),:END_ID(Node),:TYPE\n') + for u, v in edges: + out.write('{},{},child\n'.format(u, v)) diff --git a/customers/elliott/graph1.json b/customers/elliott/graph1.json new file mode 100644 index 000000000..fbdc155ef --- /dev/null +++ b/customers/elliott/graph1.json @@ -0,0 +1,39 @@ +{ + "layers": [ + { + "name": "Root", + "sublayers": 1, + "degree_lo": 1, + "degree_hi": 10, + "nodes": 20000 + }, + { + "name": "Category", + "sublayers": 5, + "degree_lo": 1, + "degree_hi": 10, + "nodes": 20000 + }, + { + "name": "Group", + "sublayers": 1, + "degree_lo": 20, + "degree_hi": 40, + "nodes": 20000 + }, + { + "name": "CustomGroup", + "sublayers": 15, + "degree_lo": 20, + "degree_hi": 40, + "nodes": 20000 + }, + { + "name": "Leaf", + "sublayers": 1, + "degree_lo": 1, + "degree_hi": 1, + "nodes": 20000 + } + ] +} diff --git a/customers/elliott/graph2.json b/customers/elliott/graph2.json new file mode 100644 index 000000000..6f67bdada --- /dev/null +++ b/customers/elliott/graph2.json @@ -0,0 +1,39 @@ +{ + "layers": [ + { + "name": "Root", + "sublayers": 1, + "degree_lo": 1, + "degree_hi": 10, + "nodes": 20000 + }, + { + "name": "Category", + "sublayers": 5, + "degree_lo": 1, + "degree_hi": 10, + "nodes": 20000 + }, + { + "name": "Group", + "sublayers": 1, + "degree_lo": 50, + "degree_hi": 100, + "nodes": 20000 + }, + { + "name": "CustomGroup", + "sublayers": 15, + "degree_lo": 50, + "degree_hi": 100, + "nodes": 20000 + }, + { + 
"name": "Leaf", + "sublayers": 1, + "degree_lo": 50, + "degree_hi": 100, + "nodes": 20000 + } + ] +} diff --git a/customers/elliott/graph3.json b/customers/elliott/graph3.json new file mode 100644 index 000000000..2fe140b2f --- /dev/null +++ b/customers/elliott/graph3.json @@ -0,0 +1,39 @@ +{ + "layers": [ + { + "name": "Root", + "sublayers": 1, + "degree_lo": 10, + "degree_hi": 50, + "nodes": 20000 + }, + { + "name": "Category", + "sublayers": 5, + "degree_lo": 10, + "degree_hi": 50, + "nodes": 20000 + }, + { + "name": "Group", + "sublayers": 1, + "degree_lo": 50, + "degree_hi": 100, + "nodes": 20000 + }, + { + "name": "CustomGroup", + "sublayers": 15, + "degree_lo": 50, + "degree_hi": 100, + "nodes": 20000 + }, + { + "name": "Leaf", + "sublayers": 1, + "degree_lo": 50, + "degree_hi": 100, + "nodes": 20000 + } + ] +}