memgraph/customers/elliott/generate_dag

126 lines
3.9 KiB
Plaintext
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Generates a DAG from JSON spec in [config] and outputs nodes to
[filename]_nodes, and edges to [filename]_edges in format convertible
to Memgraph snapshot.
Here's an example JSON spec:
{
"layers": [
{
"name": "A",
"sublayers": 1,
"degree_lo": 1,
"degree_hi": 3,
"nodes": 4
},
{
"name": "B",
"sublayers": 3,
"degree_lo": 2,
"degree_hi": 3,
"nodes": 10
},
{
"name": "C",
"sublayers": 1,
"degree_lo": 1,
"degree_hi": 1,
"nodes": 5
}
]
}
Nodes from each layer will be randomly divided into sublayers. A node can
only have edges pointing to nodes in lower sublayers of the same layer, or
to nodes from the layer directly below it. The out-degree of each node is
chosen uniformly at random from the [degree_lo, degree_hi] interval."""
import argparse
from itertools import accumulate
import json
import random
def _split_into_sum(n, k):
assert 1 <= n, "n should be at least 1"
assert k <= n, "k shouldn't be greater than n"
xs = [0] + sorted(random.sample(range(1, n), k-1)) + [n]
return [b - a for a, b in zip(xs, xs[1:])]
def generate_dag(graph_config, seed=None):
    """Generate a random layered DAG described by ``graph_config``.

    Node ids are consecutive integers starting at 1, handed out layer by
    layer in the order the layers appear in ``graph_config``. Each layer is
    randomly cut into contiguous sublayers. Every edge goes from a smaller
    id to a larger one, so the graph is acyclic.

    For every pair of adjacent layers two kinds of edges are produced:

    * each node of the upper layer gets ``randint(degree_lo, degree_hi)``
      outgoing edges whose targets lie in later sublayers of its own layer
      or anywhere in the next layer (targets are drawn independently, so
      parallel edges are possible);
    * each node of the lower layer gets one additional incoming edge from
      a node of the upper layer or of an earlier sublayer of its own layer.

    Args:
        graph_config: List of layer dicts providing the keys ``name``,
            ``nodes``, ``sublayers``, ``degree_lo`` and ``degree_hi``.
            The list and its dicts are left unmodified.
        seed: Value passed to ``random.seed``; the same seed and config
            reproduce the same graph.

    Returns:
        A ``(nodes, edges)`` tuple: ``nodes`` is a list of
        ``(node_id, layer_name)`` pairs and ``edges`` a list of
        ``(source_id, target_id)`` pairs.
    """
    random.seed(seed)

    nodes = []
    # bounds[i] lists the sublayer boundaries of layer i: each pair of
    # consecutive entries is the half-open id range [lo, hi) of a sublayer.
    # Kept locally rather than written into the caller's layer dicts.
    bounds = []
    first_id = 1
    for layer in graph_config:
        sizes = _split_into_sum(layer['nodes'], layer['sublayers'])
        bounds.append(list(accumulate([first_id] + sizes)))
        nodes.extend(
            (u, layer['name'])
            for u in range(first_id, first_id + layer['nodes'])
        )
        first_id += layer['nodes']

    edges = []
    # zip stops at the shortest input, so ``layer`` runs over every layer
    # except the last one, paired with its own and its successor's bounds.
    for layer, sub_range, sub_range_next in zip(
            graph_config, bounds, bounds[1:]):
        degree_lo = layer['degree_lo']
        degree_hi = layer['degree_hi']

        layer_lo = sub_range[0]
        next_layer_hi = sub_range_next[-1]

        # Outgoing edges: targets start right after the source's sublayer
        # and extend to the last node of the next layer.
        for sub_lo, sub_hi in zip(sub_range, sub_range[1:]):
            for u in range(sub_lo, sub_hi):
                num_edges = random.randint(degree_lo, degree_hi)
                for _ in range(num_edges):
                    v = random.randint(sub_hi, next_layer_hi - 1)
                    edges.append((u, v))

        # One incoming edge per node of the next layer, from anywhere
        # between the start of this layer and the node's own sublayer.
        for sub_lo, sub_hi in zip(sub_range_next, sub_range_next[1:]):
            for u in range(sub_lo, sub_hi):
                v = random.randint(layer_lo, sub_lo - 1)
                edges.append((v, u))

    return nodes, edges
if __name__ == '__main__':
    # Command-line interface; the module docstring doubles as --help text.
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    cli.add_argument('config', type=str, help='graph config JSON file')
    cli.add_argument('filename', type=str,
                     help='nodes will be stored to filename_nodes, '
                          'edges to filename_edges')
    cli.add_argument('--seed', type=int,
                     help='seed for the random generator (default = '
                          'current system time)')
    args = cli.parse_args()

    # Only the "layers" list of the JSON spec is consumed.
    with open(args.config, 'r') as spec_file:
        graph_config = json.load(spec_file)['layers']

    nodes, edges = generate_dag(graph_config, seed=args.seed)

    # Write nodes as CSV: the node id is reused as the topological index
    # and the layer name serves as both the name prefix and the label.
    with open('{}_nodes'.format(args.filename), 'w') as out:
        out.write('nodeId:ID(Node),name,topological_index:Int,:LABEL\n')
        out.writelines(
            '{0},{1}{0},{0},{1}\n'.format(node_id, layer)
            for node_id, layer in nodes)

    # Write edges as CSV; every edge gets the type "child".
    with open('{}_edges'.format(args.filename), 'w') as out:
        out.write(':START_ID(Node),:END_ID(Node),:TYPE\n')
        out.writelines(
            '{},{},child\n'.format(u, v) for u, v in edges)