0681040395
Summary: Recover only snapshot; return recovery info on worker recovery; update tests; start WAL recovery; single-node WAL split; keep track of WAL possible to recover; fix comment; WAL tx intersection; merge branch 'master' into sync_wal_tx. Reviewers: buda, ipaljak, dgleich, vkasljevic, teon.banek. Reviewed By: teon.banek. Subscribers: pullbot. Differential Revision: https://phabricator.memgraph.io/D1489
224 lines
7.8 KiB
Python
224 lines
7.8 KiB
Python
import json
|
|
import os
|
|
import time
|
|
|
|
# Cluster size: to change the size of the cluster, change only this value.
NUM_MACHINES = 3

# Test setup.
# Each scenario name is handed to the client binary via `--scenario`.
SCENARIOS = [
    "point_lookup",
    "create_tx",
]
# Value handed to the client via `--duration`.
DURATION = 300
# Value used for `--num-workers` / `--rpc-num-workers` on memgraph and for
# `--num-workers` on the client.
WORKERS = 6

# Constants.
# Directory that contains this script; snapshots and output live below it.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
MEMGRAPH_BINARY = "memgraph"
CLIENT_BINARY = "tests/macro_benchmark/card_fraud_client"
# NOTE(review): not referenced elsewhere in this file; presumably read by the
# harness that loads this module -- confirm.
BINARIES = [
    MEMGRAPH_BINARY,
    CLIENT_BINARY,
]
|
|
|
|
# wrappers
|
|
class WorkerWrapper:
    """Proxy around a worker handle that binds calls to a single jail.

    On construction the wrapper obtains a jail id from ``worker.get_jail()``.
    Public attributes that are not defined on the wrapper itself are
    forwarded to the wrapped worker:

    * names listed in ``_PASSTHROUGH`` are returned from the worker as-is;
    * every other public name yields a callable that invokes the worker
      method of the same name with the jail id prepended to the arguments.

    Names starting with an underscore are never forwarded; looking up a
    missing one raises ``AttributeError``. Without this, ``hasattr``,
    ``copy`` and ``pickle`` would see every private/dunder name as present.
    """

    # worker methods that are forwarded without prepending the jail id
    _PASSTHROUGH = ("allocate_file", "read_file", "store_label")

    def __init__(self, address, worker):
        """Store the address and worker handle and acquire a jail id.

        Args:
            address: address of the machine; returned by `get_address`.
            worker: worker handle; must provide `get_jail()`.
        """
        self._address = address
        self._worker = worker
        self._tid = worker.get_jail()

    def get_address(self):
        """Return the address given at construction time."""
        return self._address

    def __getattr__(self, name):
        """Forward a public attribute lookup to the wrapped worker.

        Only called when normal attribute lookup fails.

        Raises:
            AttributeError: if `name` starts with an underscore.
        """
        # private and dunder names are never proxied; this keeps protocol
        # probing (hasattr, copy, pickle) honest and surfaces an
        # uninitialised `_worker`/`_tid` as a plain AttributeError
        if name.startswith("_"):
            raise AttributeError("'{}' object has no attribute '{}'".format(
                type(self).__name__, name))

        if name in self._PASSTHROUGH:
            return getattr(self._worker, name)

        def func(*args, **kwargs):
            # the worker attribute is resolved at call time, with the jail
            # id passed as the first positional argument
            return getattr(self._worker, name)(self._tid, *args, **kwargs)

        return func
|
|
|
|
class MgCluster:
    """A memgraph cluster made of one master and zero or more workers.

    The first machine id is used for the master, the remaining ones for the
    workers. Resource usage is sampled in `start` and `stop`, and turned
    into per-node figures by `get_usage`.
    """

    def __init__(self, machine_ids, workers):
        """Create a wrapper for every machine.

        Args:
            machine_ids: sequence of machine ids; each id is both a key in
                `os.environ` (yielding the machine address) and a key in
                `workers`.
            workers: mapping from machine id to worker handle.
        """
        # create wrappers
        master_id = machine_ids[0]
        self._master = WorkerWrapper(os.environ[master_id],
                                     workers[master_id])
        self._workers = [
            WorkerWrapper(os.environ[machine_id], workers[machine_id])
            for machine_id in machine_ids[1:]
        ]

    def _nodes(self):
        """Return all wrappers, master first, then workers in order."""
        return [self._master] + self._workers

    def start(self):
        """Start the master and the workers, then record initial usage."""
        # start memgraph master
        self._master.start(MEMGRAPH_BINARY, [
            "--master",
            "--master-host", self._master.get_address(),
            "--master-port", "10000",
            "--durability-directory", os.path.join(SCRIPT_DIR, "snapshots",
                                                   "worker_0"),
            "--db-recover-on-startup",
            "--query-vertex-count-to-expand-existing", "-1",
            "--num-workers", str(WORKERS),
            "--rpc-num-workers", str(WORKERS),
            "--recovering-cluster-size", str(len(self._workers) + 1)
        ])

        # sleep to allow the master to startup
        time.sleep(5)

        # start memgraph workers; worker ids (and port offsets) start at 1
        for i, worker in enumerate(self._workers, start=1):
            worker.start(MEMGRAPH_BINARY, [
                "--worker", "--worker-id", str(i),
                "--worker-host", worker.get_address(),
                "--worker-port", str(10000 + i),
                "--master-host", self._master.get_address(),
                "--master-port", "10000",
                "--durability-directory", os.path.join(SCRIPT_DIR, "snapshots",
                                                       "worker_" + str(i)),
                "--db-recover-on-startup",
                "--num-workers", str(WORKERS),
                "--rpc-num-workers", str(WORKERS),
            ])

        # sleep to allow the workers to startup
        time.sleep(15)

        # store initial usage (master first, then workers)
        self._usage_start = [node.get_usage() for node in self._nodes()]
        self._usage_start_time = time.time()

    def get_master_address(self):
        """Return the address of the master machine."""
        return self._master.get_address()

    def check_status(self):
        """Return True if the master and every worker report a truthy status.

        The master is checked first; checking stops at the first node whose
        `check_status()` is falsy.
        """
        return all(node.check_status() for node in self._nodes())

    def stop(self):
        """Record final usage, then stop the master followed by the workers."""
        # store final usage (master first, then workers)
        self._usage_stop = [node.get_usage() for node in self._nodes()]
        self._usage_stop_time = time.time()

        # stop the master
        self._master.stop()

        # wait to allow the master and workers to die
        time.sleep(5)

        # stop the workers
        for worker in self._workers:
            worker.stop()

        # wait to allow the workers to die
        time.sleep(5)

    def get_usage(self, interface="eth0"):
        """Compute per-node usage between `start` and `stop`.

        Must be called after both `start` and `stop`, because it reads the
        samples and timestamps they store.

        Args:
            interface: name of the network interface whose counters are
                read from each sample's "network" entry. Defaults to
                "eth0", the value that was previously hard-coded.

        Returns:
            A list with one dict per node (master first), with keys:
              "cpu": difference of the "cpu" samples divided by the elapsed
                  wall-clock time between the two samplings;
              "memory": final "max_memory" divided by 1024;
              "threads": final "max_threads";
              "network": {"bytes"|"packets": {"rx"|"tx": rate}}, where rate
                  is the counter difference divided by the elapsed time.
        """
        ret = []
        tdelta = self._usage_stop_time - self._usage_start_time
        for val_start, val_stop in zip(self._usage_start, self._usage_stop):
            net_start = val_start["network"][interface]
            net_stop = val_stop["network"][interface]
            network = {
                kind: {
                    direction: (net_stop[kind][direction] -
                                net_start[kind][direction]) / tdelta
                    for direction in ["rx", "tx"]
                }
                for kind in ["bytes", "packets"]
            }
            ret.append({
                "cpu": (val_stop["cpu"] - val_start["cpu"]) / tdelta,
                "memory": val_stop["max_memory"] / 1024,
                "threads": val_stop["max_threads"],
                "network": network,
            })
        return ret

    def store_label(self, label):
        """Forward `label` to `store_label` of the master and every worker."""
        for node in self._nodes():
            node.store_label(label)
|
|
|
|
def write_scenario_summary(scenario, throughput, usage, output):
    """Write a summary of one scenario to `output`.

    The summary is a headline with the throughput followed by an HTML table
    with one row per cluster node.

    Args:
        scenario: scenario name, inserted into the headline.
        throughput: queries per second, printed with two decimals.
        usage: list of per-node usage dicts, master first; each must have
            the keys "cpu", "memory", "threads" and
            ["network"]["packets"]["rx"|"tx"].
        output: object with a `write(str)` method.
    """
    output.write("Scenario **{}** throughput !!{:.2f}!! queries/s.\n\n".format(
        scenario, throughput))
    headers = ["Memgraph", "CPU", "Max memory", "Max threads",
               "Network RX", "Network TX"]
    output.write("<table>\n<tr>")
    for header in headers:
        output.write("<th>{}</th>".format(header))
    output.write("</tr>\n")
    for i, current in enumerate(usage):
        # the first entry is the master, the rest are numbered workers
        name = "master" if i == 0 else "worker" + str(i)
        output.write("<tr><td>{}</td>".format(name))
        for key, unit in [("cpu", "s/s"), ("memory", "MiB"), ("threads", "")]:
            fmt = ".2f" if key != "threads" else ""
            # strip the cell text *before* wrapping it in <td>, so a value
            # without a unit does not keep a trailing space
            cell = ("{:" + fmt + "} {}").format(current[key], unit).strip()
            output.write("<td>{}</td>".format(cell))
        for key in ["rx", "tx"]:
            output.write("<td>{:.2f} packets/s</td>".format(
                current["network"]["packets"][key]))
        output.write("</tr>\n")
    output.write("</table>\n\n")
|
|
|
|
# main test function
|
|
def run(machine_ids, workers):
    """Run every scenario in SCENARIOS against a fresh cluster and summarize.

    For each scenario the cluster is started, the client binary is run on
    the first machine until it terminates, the cluster is stopped, and the
    throughput is computed from the last non-empty line of the client's
    JSON output. Finally a summary for all scenarios is written to
    ``<SCRIPT_DIR>/output/.card_fraud_summary``.

    Args:
        machine_ids: sequence of machine ids; the first one hosts both the
            master and the client.
        workers: mapping from machine id to worker handle.

    Raises:
        AssertionError: if the cluster stops reporting a healthy status
            while the client is still running.
    """
    # create output directory (no error if it already exists)
    output_dir = os.path.join(SCRIPT_DIR, "output")
    os.makedirs(output_dir, exist_ok=True)

    # create memgraph cluster and client
    mg_cluster = MgCluster(machine_ids, workers)
    mg_client = WorkerWrapper(os.environ[machine_ids[0]],
                              workers[machine_ids[0]])

    # execute the tests
    stats = {}
    for scenario in SCENARIOS:
        output_file = os.path.join(output_dir, scenario + ".json")

        print("Starting memgraph cluster")
        mg_cluster.store_label("Start: cluster")
        mg_cluster.start()

        print("Starting client scenario:", scenario)
        mg_cluster.store_label("Start: " + scenario)
        mg_client.start(CLIENT_BINARY, [
            "--address", mg_cluster.get_master_address(),
            "--group", "card_fraud",
            "--scenario", scenario,
            "--duration", str(DURATION),
            "--num-workers", str(WORKERS),
            "--output", output_file,
        ])

        # wait for the client to terminate and check the cluster status
        while mg_client.check_status():
            # raised explicitly (not via `assert`) so the check also runs
            # when Python is started with -O
            if not mg_cluster.check_status():
                raise AssertionError("The memgraph cluster has died!")
            time.sleep(2)

        # stop everything
        mg_client.wait()
        mg_cluster.store_label("Stop: " + scenario)
        mg_cluster.stop()
        mg_cluster.store_label("Stop: cluster")

        # process the stats: the last non-empty line of the client output
        # holds the JSON document that is used
        with open(output_file) as results:
            lines = [line for line in results.read().split("\n")
                     if line.strip()]
        data = json.loads(lines[-1])
        throughput = data["num_executed_queries"] / data["elapsed_time"]
        usage = mg_cluster.get_usage()
        stats[scenario] = (throughput, usage)

    # dump the stats; the context manager closes the file even on error
    summary_path = os.path.join(output_dir, ".card_fraud_summary")
    with open(summary_path, "w") as stats_file:
        stats_file.write("==== Distributed card fraud summary: ====\n\n")
        for scenario in SCENARIOS:
            throughput, usage = stats[scenario]
            write_scenario_summary(scenario, throughput, usage, stats_file)
|