#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
This script attempts to evaluate the feasibility of using Memgraph for
Otto group's usecase. The usecase is finding connected componentes in
a large, very sparse graph (cca 220M nodes, 250M edges), based on a dynamic
inclusion / exclusion of edges (w.r.t variable parameters and the source node
type).

This implementation defines a random graph with the given number of nodes
and edges and looks for connected components using breadth-first expansion.
Edges are included / excluded based on a simple expression, only demonstrating
possible usage.
"""

from argparse import ArgumentParser
import logging
from time import time
from collections import defaultdict
from math import log2
from random import randint

from neo4j.v1 import GraphDatabase

log = logging.getLogger(__name__)


def generate_graph(sess, node_count, edge_count):
    """Populate the database with a random graph.

    Args:
        sess: an open driver session; every statement is run and consumed
            on it.
        node_count: number of `:Node` vertices to create (ids 0..node_count-1).
        edge_count: number of `:Edge` relationships to create.
    """
    # An index that will speed-up edge creation.
    sess.run("CREATE INDEX ON :Node(id)").consume()

    # Create the given number of nodes with a randomly selected type from:
    # [0.5, 1.5, 2.5].
    sess.run(("UNWIND range(0, {} - 1) AS id CREATE "
              "(:Node {{id: id, type: 0.5 + tointeger(rand() * 3)}})").format(
                  node_count)).consume()

    # Create the given number of edges, each with a 'value' property of
    # a random [0, 3.0) float. Each edge connects two random nodes, so the
    # expected node degree is (edge_count * 2 / node_count). Generate edges
    # so the connectivity is non-uniform (to produce connected components of
    # various sizes): the upper bound of the target node id grows with the
    # index of the edge being created.
    sess.run(("UNWIND range(0, {0} - 1) AS id WITH id "
              "MATCH (from:Node {{id: tointeger(rand() * {1})}}), "
              "(to:Node {{id: tointeger(rand() * {1} * id / {0})}}) "
              "CREATE (from)-[:Edge {{value: 3 * rand()}}]->(to)").format(
                  edge_count, node_count)).consume()


def get_connected_ids(sess, node_id):
    """Return the NUMBER of nodes reached by a filtered BFS from `node_id`.

    Despite the function's name, the query ends in `RETURN count(*) AS c`
    and only that integer is returned, not the node ids themselves.

    Within the BFS lambda expression an edge is expanded over only when its
    'value' differs from the 'type' of the starting node by less than 0.80.

    Args:
        sess: an open driver session.
        node_id: value of the `id` property of the starting node; it is
            formatted directly into the query text.

    Returns:
        The value of the `c` column of the first result row.
        NOTE(review): the caller adds 1 to this value, which suggests the
        starting node is not counted - confirm against Memgraph semantics.
    """
    return sess.run((
        "MATCH (from:Node {{id: {}}})-"
        "[*bfs (e, n | abs(from.type - e.value) < 0.80)]-(d) "
        "RETURN count(*) AS c").format(node_id)).data()[0]['c']


def parse_args():
    """Parse the command line and return the resulting namespace.

    The module docstring is used as the program description.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument('--endpoint', type=str, default='localhost:7687',
                        help='Memgraph instance endpoint. ')
    parser.add_argument('--node-count', type=int, default=1000,
                        help='The number of nodes in the graph')
    parser.add_argument('--edge-count', type=int, default=1000,
                        help='The number of edges in the graph')
    parser.add_argument('--sample-count', type=int, default=None,
                        help='The number of samples to take')
    return parser.parse_args()


def _log_histogram(histogram):
    """Log one line per log2 bucket: the count and the size range it covers."""
    log.info("Component size histogram (count | range)")
    for log2_size, count in sorted(histogram.items()):
        log.info("\t%5d | %d - %d", count, 2 ** log2_size,
                 2 ** (log2_size + 1) - 1)


def _run_benchmark(sess, args):
    """Wipe the database, generate the graph, sample nodes and log timings."""
    sess.run("MATCH (n) DETACH DELETE n").consume()

    log.info("Generating graph with %s nodes and %s edges...",
             args.node_count, args.edge_count)
    generate_graph(sess, args.node_count, args.edge_count)

    start_time = time()
    max_query_time = 0
    log.info("Looking for connected components...")

    # Histogram of log2 sizes of connected components found.
    histogram = defaultdict(int)
    # A missing (or zero) --sample-count means one sample per node.
    sample_count = args.sample_count or args.node_count
    for _ in range(sample_count):
        node_id = randint(0, args.node_count - 1)
        query_start_time = time()
        log2_size = int(log2(1 + get_connected_ids(sess, node_id)))
        max_query_time = max(max_query_time, time() - query_start_time)
        histogram[log2_size] += 1

    elapsed = time() - start_time
    # No samples taken means there is no meaningful average; avoid dividing
    # by zero (e.g. when --node-count is 0).
    avg_ms = elapsed / sample_count * 1000 if sample_count > 0 else 0.0
    log.info("Connected components found in %.2f sec (avg %.2fms, max %.2fms)",
             elapsed, avg_ms, max_query_time * 1000)

    _log_histogram(histogram)


def main():
    """Connect to the endpoint, run the benchmark and always disconnect."""
    args = parse_args()
    logging.basicConfig(level=logging.INFO)
    log.info("Memgraph - Otto test database generator")
    logging.getLogger("neo4j").setLevel(logging.WARNING)

    driver = GraphDatabase.driver(
        'bolt://' + args.endpoint,
        auth=("ignored", "ignored"),
        encrypted=False)
    # Close the session and the driver even when a query fails, so a broken
    # run does not leave connections open.
    try:
        sess = driver.session()
        try:
            _run_benchmark(sess, args)
        finally:
            sess.close()
    finally:
        driver.close()


if __name__ == '__main__':
    main()