diff --git a/customers/otto/connected_components_test.py b/customers/otto/connected_components_test.py
new file mode 100644
index 000000000..ec802ae47
--- /dev/null
+++ b/customers/otto/connected_components_test.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+This script evaluates the feasibility of using Memgraph for the Otto
+group's use case: finding connected components in a large, very sparse
+graph (approximately 220M nodes, 250M edges), based on dynamic inclusion /
+exclusion of edges (w.r.t. variable parameters and the source node
+type).
+
+This implementation defines a random graph with the given number of nodes
+and edges and looks for connected components using breadth-first expansion.
+Edges are included / excluded based on a simple expression that only
+demonstrates possible usage.
+"""
+
+from argparse import ArgumentParser
+import logging
+from time import time
+from collections import defaultdict
+from math import log2
+from random import randint
+
+from neo4j.v1 import GraphDatabase
+
+log = logging.getLogger(__name__)
+
+
+def generate_graph(sess, node_count, edge_count):
+    # An index that will speed up edge creation.
+    sess.run("CREATE INDEX ON :Node(id)").consume()
+
+    # Create the given number of nodes with a randomly selected type from:
+    # [0.5, 1.5, 2.5].
+    sess.run(("UNWIND range(0, {} - 1) AS id CREATE "
+              "(:Node {{id: id, type: 0.5 + tointeger(rand() * 3)}})").format(
+                  node_count)).consume()
+
+    # Create the given number of edges, each with a 'value' property of
+    # a random [0, 3.0) float. Each edge connects two random nodes, so the
+    # expected node degree is (edge_count * 2 / node_count). Generate edges
+    # so the connectivity is non-uniform (to produce connected components of
+    # various sizes).
+    sess.run(("UNWIND range(0, {0} - 1) AS id WITH id "
+              "MATCH (from:Node {{id: tointeger(rand() * {1})}}), "
+              "(to:Node {{id: tointeger(rand() * {1} * id / {0})}}) "
+              "CREATE (from)-[:Edge {{value: 3 * rand()}}]->(to)").format(
+                  edge_count, node_count)).consume()
+
+
+def get_connected_ids(sess, node_id):
+    # Matches a node with the given ID and returns the number of nodes
+    # it is connected to. Note that the BFS lambda expression contains
+    # a filter that determines which edges are expanded over.
+    return sess.run((
+        "MATCH (from:Node {{id: {}}})-"
+        "[*bfs (e, n | abs(from.type - e.value) < 0.80)]-(d) "
+        "RETURN count(*) AS c").format(node_id)).data()[0]['c']
+
+
+def parse_args():
+    parser = ArgumentParser(description=__doc__)
+    parser.add_argument('--endpoint', type=str, default='localhost:7687',
+                        help='Memgraph instance endpoint.')
+    parser.add_argument('--node-count', type=int, default=1000,
+                        help='The number of nodes in the graph')
+    parser.add_argument('--edge-count', type=int, default=1000,
+                        help='The number of edges in the graph')
+    parser.add_argument('--sample-count', type=int, default=None,
+                        help='The number of samples to take')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    logging.basicConfig(level=logging.INFO)
+    log.info("Memgraph - Otto test database generator")
+    logging.getLogger("neo4j").setLevel(logging.WARNING)
+
+    driver = GraphDatabase.driver(
+        'bolt://' + args.endpoint,
+        auth=("ignored", "ignored"),
+        encrypted=False)
+    sess = driver.session()
+
+    sess.run("MATCH (n) DETACH DELETE n").consume()
+
+    log.info("Generating graph with %s nodes and %s edges...",
             args.node_count, args.edge_count)
+    generate_graph(sess, args.node_count, args.edge_count)
+
+    # Sample random start nodes and measure the size of their components.
+    start_time = time()
+    max_query_time = 0
+    log.info("Looking for connected components...")
+    # Histogram of log2 sizes of connected components found.
+    histogram = defaultdict(int)
+    sample_count = args.sample_count if args.sample_count else args.node_count
+    for i in range(sample_count):
+        node_id = randint(0, args.node_count - 1)
+        query_start_time = time()
+        log2_size = int(log2(1 + get_connected_ids(sess, node_id)))
+        max_query_time = max(max_query_time, time() - query_start_time)
+        histogram[log2_size] += 1
+    elapsed = time() - start_time
+    log.info("Connected components found in %.2f sec (avg %.2fms, max %.2fms)",
+             elapsed, elapsed / sample_count * 1000, max_query_time * 1000)
+    log.info("Component size histogram (count | range)")
+    for log2_size, count in sorted(histogram.items()):
+        log.info("\t%5d | %d - %d", count, 2 ** log2_size,
+                 2 ** (log2_size + 1) - 1)
+
+    sess.close()
+    driver.close()
+
+
+if __name__ == '__main__':
+    main()
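
A possible extension, not part of the patch above: the BFS edge filter in get_connected_ids hardcodes the 0.80 threshold, while the module docstring talks about inclusion / exclusion based on variable parameters. The following is a minimal sketch of how the same query could take the threshold as an argument, mirroring the string-formatting style already used in the script; the helper name get_connected_count and its default threshold are illustrative and do not exist in the patch.

    def get_connected_count(sess, node_id, threshold=0.80):
        # Same BFS expansion as get_connected_ids above, but with the edge
        # inclusion threshold supplied by the caller instead of hardcoded.
        # Illustrative sketch only; query text otherwise unchanged.
        return sess.run((
            "MATCH (from:Node {{id: {}}})-"
            "[*bfs (e, n | abs(from.type - e.value) < {})]-(d) "
            "RETURN count(*) AS c").format(node_id, threshold)).data()[0]['c']

For example, calling get_connected_count(sess, node_id, 0.5) in place of get_connected_ids(sess, node_id) in main would tighten the edge inclusion criterion without touching the query text.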