#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
A long running test that performs
random CRUD ops on a bolt database.

Parameterized with vertex and edge counts around which
the graph state oscillates.
"""

import logging
import multiprocessing
import random
import time
from collections import defaultdict

import neo4j.exceptions

import common

# module-level logger shared by the main process and the runner processes
log = logging.getLogger(__name__)

# Template of the label every runner puts on the vertices it creates.
# It is formatted with the runner id, and an index on `id` is created
# for each resulting label before the runners start.
INDEX_FORMAT = "indexed_label{}"

def random_element(lst):
    """
    Returns a uniformly chosen random element of a sequence.

    Args:
        lst - non-empty sequence (anything supporting len() and indexing)

    Raises:
        ValueError - if the sequence is empty
    """
    if len(lst) == 0:
        raise ValueError("can't pick a random element of an empty sequence")
    return random.choice(lst)

def bernoulli(p):
    """
    Performs a single Bernoulli trial.

    Args:
        p - float, probability of success. Values <= 0 never succeed,
            values >= 1 always succeed.

    Returns:
        bool, True with probability p.
    """
    return random.random() < p


class Graph():
    """
    Exposes functions for working on a graph, and tracks some
    statistics about graph state.
    """

    def __init__(self, vertex_count, edge_count, labels=5):
        """
        Args:
            vertex_count - int, desired vertex count
            edge_count - int, desired edge count
            labels - int, the number of labels to use
        """
        # desired vertex and edge counts
        self.vertex_count = vertex_count
        self.edge_count = edge_count

        # storage
        self.edges = []
        self.vertices = []
        # maps a label name ("label0", "label1", ...) to its own list
        self.labels = {"label%d" % i: [] for i in range(labels)}

        # info about query failures, maps exception string representations
        # into occurrence counts
        self._query_failure_counts = defaultdict(int)

    def add_query_failure(self, reason):
        """ Increments the occurrence count of the given failure reason """
        self._query_failure_counts[reason] += 1

    def query_failures(self):
        """ Returns a plain dict copy of the (reason -> count) mapping """
        return dict(self._query_failure_counts)


class GraphSession():
    """
    Encapsulates a Graph and a Bolt session and provides CRUD op functions.
    Also defines a run-loop for a generic executor, and a graph state
    verification function.
    """

    def __init__(self, sid, graph, session):
        """
        Args:
            sid - id of this runner; it is formatted into the indexed
                label name and logged with "%d"
            graph - Graph, the local book-keeping of the database state
            session - Bolt session, queries are sent through
                `session.run(query).data()`
        """
        self.sid = sid

        # the label in the database that is indexed
        # used for matching vertices faster
        self.indexed_label = INDEX_FORMAT.format(sid)

        # ids that the next created vertex / edge will get
        self.vertex_id = 1
        self.edge_id = 1

        self.graph = graph
        self.session = session
        self.executed_queries = 0
        self._start_time = time.time()

    @property
    def v(self):
        """ List of ids of the vertices tracked in the local graph """
        return self.graph.vertices

    @property
    def e(self):
        """ List of ids of the edges tracked in the local graph """
        return self.graph.edges

    def execute(self, query):
        """
        Runs a query and counts it as executed.

        Returns:
            the rows produced by `.data()` on success, None if the query
            failed. A failure is recorded in the graph's failure counters
            under the exception's string representation.

        Raises:
            neo4j.exceptions.ServiceUnavailable - propagated unchanged,
            it is never recorded as a query failure.
        """
        log.debug("Runner %d executing query: %s", self.sid, query)
        self.executed_queries += 1
        try:
            return self.session.run(query).data()
        except neo4j.exceptions.ServiceUnavailable:
            raise
        except Exception as e:
            self.graph.add_query_failure(str(e))
            return None

    def create_vertices(self, vertices_count):
        """
        Creates `vertices_count` vertices using a single query. The ids
        are recorded locally only if the query did not fail, but they are
        consumed either way so an id is never handed out twice.
        """
        if vertices_count <= 0:
            return
        new_ids = list(range(self.vertex_id, self.vertex_id + vertices_count))
        self.vertex_id += vertices_count
        query = "".join("CREATE (:%s {id: %r}) " % (self.indexed_label, new_id)
                        for new_id in new_ids)
        if self.execute(query) is not None:
            self.v.extend(new_ids)

    def remove_vertex(self):
        """
        Detach-deletes a random tracked vertex and updates the local
        vertex, label and edge book-keeping from the returned rows.
        """
        vertex_id = random_element(self.v)
        result = self.execute(
            "MATCH (n:%s {id: %r}) OPTIONAL MATCH (n)-[r]-() "
            "DETACH DELETE n RETURN n.id, labels(n), r.id" %
            (self.indexed_label, vertex_id))
        if result:
            process_vertex_ids = set()
            for row in result:
                # remove vertex but note there could be duplicates
                # (the query returns one row per edge of the vertex)
                vertex_id = row['n.id']
                if vertex_id not in process_vertex_ids:
                    process_vertex_ids.add(vertex_id)
                    self.v.remove(vertex_id)
                    for label in row['labels(n)']:
                        # the indexed label is not tracked per-vertex
                        if label != self.indexed_label:
                            self.graph.labels[label].remove(vertex_id)
                # remove edge
                edge_id = row['r.id']
                if edge_id is not None:
                    self.e.remove(edge_id)

    def create_edge(self):
        """
        Creates an edge between two random tracked vertices. The edge is
        recorded locally only if the query returned a row.
        """
        creation = self.execute(
            "MATCH (from:%s {id: %r}), (to:%s {id: %r}) "
            "CREATE (from)-[e:EdgeType {id: %r}]->(to) RETURN e" % (
                self.indexed_label, random_element(self.v), self.indexed_label,
                random_element(self.v), self.edge_id))
        if creation:
            self.e.append(self.edge_id)
            self.edge_id += 1

    def remove_edge(self):
        """
        Deletes a random tracked edge. The edge is dropped from the local
        book-keeping only if the query returned a row.
        """
        edge_id = random_element(self.e)
        result = self.execute("MATCH (:%s)-[e {id: %r}]->(:%s) DELETE e "
                              "RETURN e.id" % (self.indexed_label, edge_id,
                                               self.indexed_label))
        if result:
            self.e.remove(edge_id)

    def add_label(self):
        """ Sets a random label on a random tracked vertex """
        vertex_id = random_element(self.v)
        label = random.choice(list(self.graph.labels.keys()))
        # add a label on a vertex that didn't have that label
        # yet (we need that for book-keeping)
        result = self.execute("MATCH (v:%s {id: %r}) WHERE not v:%s SET v:%s "
                              "RETURN v.id" % (self.indexed_label, vertex_id,
                                               label, label))
        if result:
            self.graph.labels[label].append(vertex_id)

    def update_global_vertices(self):
        """
        Sets a random `value` on all vertices (of any label) whose id
        falls inside a random window about 1% of the id range wide.
        """
        lo = random.randint(0, self.vertex_id)
        hi = lo + int(self.vertex_id * 0.01)
        num = random.randint(0, 2 ** 20)
        self.execute("MATCH (n) WHERE n.id > %d AND n.id < %d "
                     "SET n.value = %d" % (lo, hi, num))

    def update_global_edges(self):
        """
        Sets a random `value` on all edges whose id falls inside a
        random window about 1% of the id range wide.
        """
        lo = random.randint(0, self.edge_id)
        hi = lo + int(self.edge_id * 0.01)
        num = random.randint(0, 2 ** 20)
        self.execute("MATCH ()-[e]->() WHERE e.id > %d AND e.id < %d "
                     "SET e.value = %d" % (lo, hi, num))

    def verify_graph(self):
        """
        Checks if the local info corresponds to DB state.

        Raises:
            AssertionError - if a verification query fails or a count in
            the database differs from the local book-keeping.
        """

        def test(obj, length, message):
            # raised explicitly so the check is not stripped by `python -O`
            if len(obj) != length:
                raise AssertionError(message % (len(obj), length))

        def get(query, key):
            ret = self.execute(query)
            if ret is None:
                raise AssertionError(
                    "Query '{}' returned 'None'!".format(query))
            return [row[key] for row in ret]

        test(self.v, get("MATCH (n:{}) RETURN count(n)".format(
            self.indexed_label), "count(n)")[0],
            "Expected %d vertices, found %d")
        test(self.e, get("MATCH (:{0})-[r]->(:{0}) RETURN count(r)".format(
            self.indexed_label), "count(r)")[0],
            "Expected %d edges, found %d")
        for lab, exp in self.graph.labels.items():
            test(exp, get("MATCH (n:%s:%s) RETURN count(n)" % (
                self.indexed_label, lab), "count(n)")[0],
                "Expected %d vertices with label '{}', found %d".format(lab))

        log.info("Runner %d graph verification success:", self.sid)
        log.info("\tExecuted %d queries in %.2f seconds",
                 self.executed_queries, time.time() - self._start_time)
        log.info("\tGraph has %d vertices and %d edges",
                 len(self.v), len(self.e))
        for label in sorted(self.graph.labels.keys()):
            log.info("\tVertices with label '%s': %d",
                     label, len(self.graph.labels[label]))
        failures = self.graph.query_failures()
        if failures:
            log.info("\tQuery failed (reason: count)")
            for reason, count in failures.items():
                log.info("\t\t'%s': %d", reason, count)

    def run_loop(self, vertex_batch, query_count, max_time, verify):
        """
        Populates the graph and then keeps executing random operations.

        Args:
            vertex_batch - int, number of vertices created per query
                during the initial population, must not be 0
            query_count - int, the loop stops once this many queries
                have been executed
            max_time - number, the loop stops after this many minutes
            verify - number, seconds between graph verifications,
                verification is disabled when it is not positive

        The desired vertex and edge counts of the graph are used as
        divisors, so both must be non-zero.
        """
        # start the test
        start_time = last_verify = time.time()

        # initial batched vertex creation
        for _ in range(self.graph.vertex_count // vertex_batch):
            if (time.time() - start_time) / 60 > max_time \
                    or self.executed_queries > query_count:
                break
            self.create_vertices(vertex_batch)
        self.create_vertices(self.graph.vertex_count % vertex_batch)

        # run rest
        while self.executed_queries < query_count:
            now_time = time.time()
            if (now_time - start_time) / 60 > max_time:
                break

            if verify > 0 and (now_time - last_verify) > verify:
                self.verify_graph()
                last_verify = now_time

            # how full the graph is compared to the desired counts
            ratio_e = len(self.e) / self.graph.edge_count
            ratio_v = len(self.v) / self.graph.vertex_count

            # try to edit vertices globally
            if bernoulli(0.01):
                self.update_global_vertices()

            # try to edit edges globally
            if bernoulli(0.01):
                self.update_global_edges()

            # prefer adding/removing edges whenever there is an edge
            # imbalance and there is enough vertices
            if ratio_v > 0.5 and abs(1 - ratio_e) > 0.2:
                # the fuller the graph, the likelier a removal
                if bernoulli(ratio_e / 2.0):
                    self.remove_edge()
                else:
                    self.create_edge()
                continue

            # if we are near vertex balance, we can also do updates
            # instead of creates / deletes
            if abs(1 - ratio_v) < 0.5 and bernoulli(0.5):
                self.add_label()
                continue

            if bernoulli(ratio_v / 2.0):
                self.remove_vertex()
            else:
                self.create_vertices(1)


def runner(params):
    """
    Entry point of a single worker process.

    Args:
        params - tuple (num, args): `num` is the id of the runner and
            `args` are the parsed program arguments. They are packed
            into one tuple because the function is mapped over a
            process pool.

    The desired vertex / edge counts and the query limit are divided
    evenly among the workers.
    """
    num, args = params
    driver = common.argument_driver(args)
    graph = Graph(args.vertex_count // args.worker_count,
                  args.edge_count // args.worker_count)
    log.info("Starting query runner process")
    session = GraphSession(num, graph, driver.session())
    session.run_loop(args.vertex_batch, args.max_queries // args.worker_count,
                     args.max_time, args.verify)
    log.info("Runner %d executed %d queries", num, session.executed_queries)


def parse_args():
    """
    Extends the common connection argument parser with the arguments
    of this test and parses the command line.

    Returns:
        the parsed arguments namespace
    """
    argp = common.connection_argument_parser()
    argp.add_argument("--logging", default="INFO",
                      choices=["INFO", "DEBUG", "WARNING", "ERROR"],
                      help="Logging level")
    argp.add_argument("--vertex-count", type=int, required=True,
                      help="The average number of vertices in the graph")
    argp.add_argument("--edge-count", type=int, required=True,
                      help="The average number of edges in the graph")
    argp.add_argument("--vertex-batch", type=int, default=200,
                      help="The number of vertices to be created "
                      "in a single query")
    argp.add_argument("--prop-count", type=int, default=5,
                      help="The max number of properties on a node")
    argp.add_argument("--max-queries", type=int, default=2 ** 30,
                      help="Maximum number of queries to execute")
    argp.add_argument("--max-time", type=int, default=2 ** 30,
                      help="Maximum execution time in minutes")
    argp.add_argument("--verify", type=int, default=0,
                      help="Interval (seconds) between checking local info")
    argp.add_argument("--worker-count", type=int, default=1,
                      help="The number of workers that operate on the graph "
                      "independently")
    return argp.parse_args()


def main():
    """
    Cleans the database, creates one index per worker and runs the
    workers in a process pool until all of them are done.
    """
    args = parse_args()
    if args.logging:
        # apply the requested level, otherwise INFO messages are dropped
        logging.basicConfig(level=args.logging)
    log.info("Starting Memgraph long running test")

    # cleanup and create indexes
    driver = common.argument_driver(args)
    driver.session().run("MATCH (n) DETACH DELETE n").consume()
    for i in range(args.worker_count):
        label = INDEX_FORMAT.format(i)
        driver.session().run("CREATE INDEX ON :%s(id)" % label).consume()

    # one (runner id, args) tuple per worker, chunksize 1 so that every
    # pool process gets exactly one runner
    params = [(i, args) for i in range(args.worker_count)]
    with multiprocessing.Pool(args.worker_count) as p:
        p.map(runner, params, 1)

    log.info("All query runners done")


if __name__ == '__main__':
    main()