memgraph/tools/manage_distributed_card_fraud
Marko Culinovic a997d7a15b Modify kill and clean operations
Summary:
Clean operations are executed in parallel, kill command now
kills memgraph on all machines explicitly.

Reviewers: mtomic

Reviewed By: mtomic

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D1259
2018-03-05 10:00:43 +01:00

503 lines
21 KiB
Python
Executable File

#!/usr/bin/env python3
import concurrent.futures
import itertools
import json
import logging
import multiprocessing
import os
import subprocess
from argparse import ArgumentParser
from collections import namedtuple
from pathlib import Path
"""
Script provides helper actions for strata card fraud demo:
1) Copy memgraph to cluster
2) Copy durability directories to cluster
3) Start master and workers
4) Stop memgraph cluster
5) Clean memgraph directories
6) Clean durability directories
7) Collect memgraph logs
8) Start and stop tcpdump
Cluster config should be provided in separate json file.
Example usage:
```./manage_distributed_card_fraud memgraph-copy --config config.json```
Example json config:
{
"workload_machines":
[
{
"host" : "distcardfraud2.memgraph.io",
"type" : "master",
"address" : "10.1.13.5",
"port" : 10000,
"num_workers" : 4,
"rpc_num_workers" : 4,
"ssh_port" : 60022
},
{
"host" : "distcardfraud3.memgraph.io",
"type" : "worker",
"address" : "10.1.13.6",
"port" : 10001,
"num_workers" : 2,
"rpc_num_workers" : 2,
"ssh_port" : 60022
},
{
"host" : "distcardfraud4.memgraph.io",
"type" : "worker",
"address" : "10.1.13.7",
"port" : 10002,
"num_workers" : 2,
"rpc_num_workers" : 2,
"ssh_port" : 60022
}
],
"benchmark_machines":["distcardfraud1.memgraph.io"],
"statsd_address": "10.1.13.4",
"statsd_port" : "2500",
"remote_user": "memgraph",
"ssh_public_key" : "~/.ssh/id_rsa",
"memgraph_build_dir" : "~/Development/memgraph/build",
"memgraph_remote_dir" : "/home/memgraph/memgraph_remote",
"durability_dir" : "~/Development/memgraph/build/tests/manual/test_dir",
"durability_remote_dir" : "/home/memgraph/memgraph_remote/durability",
"logs_dir" : "~/Development/memgraph/logs",
"logs_remote_dir" : "/home/memgraph/memgraph_remote/logs",
"tcpdump_dir" : "~/Development/memgraph/tcpdumps",
"tcpdump_remote_dir" : "/home/memgraph/tcpdumps"
}
"""
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("RemoteRunner")
def parse_args():
"""
Parse command line arguments
"""
parser = ArgumentParser(description=__doc__)
actions = [func for func in dir(RemoteRunner)
if callable(getattr(RemoteRunner, func))]
actions = [action.replace('_', '-') for action in actions
if not action.startswith('_')]
parser.add_argument("action", metavar="action", choices=actions,
help=", ".join(actions))
parser.add_argument("--config", default="config.json",
help="Config for cluster.")
return parser.parse_args()
class Machine(namedtuple('Machine', ['host', 'type', 'address',
'port', 'num_workers',
'rpc_num_workers', 'ssh_port'])):
__slots__ = () # prevent creation of instance dictionaries
def __init__(self, **kwargs):
assert isinstance(self.port, int), "port must be an integer"
assert isinstance(self.num_workers, int), "num_workers must be \
an integer"
assert isinstance(self.rpc_num_workers, int), "rpc_num_workers must be \
an integer"
assert isinstance(self.ssh_port, int), "ssh_port must be an integer"
class Config:
def __init__(self, config_file):
data = json.load(open(config_file))
self.workload_machines = [Machine(**config)
for config in data["workload_machines"]]
self.benchmark_machines = data["benchmark_machines"]
self.statsd_address = data["statsd_address"]
self.statsd_port = data["statsd_port"]
self.remote_user = data["remote_user"]
self.ssh_public_key = data["ssh_public_key"]
self.ssh_public_key = str(Path(self.ssh_public_key).expanduser())
self.memgraph_build_dir = data["memgraph_build_dir"]
self.memgraph_build_dir = str(
Path(self.memgraph_build_dir).expanduser())
self.memgraph_remote_dir = data["memgraph_remote_dir"]
self.durability_dir = data["durability_dir"]
self.durability_dir = str(Path(self.durability_dir).expanduser())
self.durability_remote_dir = data["durability_remote_dir"]
self.logs_dir = data["logs_dir"]
self.logs_dir = str(Path(self.logs_dir).expanduser())
self.logs_remote_dir = data["logs_remote_dir"]
self.tcpdump_dir = data["tcpdump_dir"]
self.tcpdump_dir = str(Path(self.tcpdump_dir).expanduser())
self.tcpdump_remote_dir = data["tcpdump_remote_dir"]
log.info("Initializing with config\n{}".format(
json.dumps(data, indent=4, sort_keys=True)))
def master(self):
return next(filter(lambda m: m.type == "master",
self.workload_machines), None)
def workers(self):
return list(filter(lambda m: m.type == "worker",
self.workload_machines))
class RemoteRunner:
def __init__(self, config):
self._config = config
self._ssh_args = ["-i", self._config.ssh_public_key]
self._scp_args = ["-i", self._config.ssh_public_key]
self._ssh_nohost_cmd = ["ssh"] + self._ssh_args + ["-p"]
self.executor = concurrent.futures.ThreadPoolExecutor(
max_workers=multiprocessing.cpu_count())
def _run_cmd(self, cmd, stdout=None, stderr=subprocess.PIPE,
host=None, user=None, ssh_port=None, **kwargs):
if not stdout:
stdout = open("/dev/null", "w")
if cmd[0] != "scp":
assert host is not None, "Remote host not specified"
assert ssh_port is not None, "ssh port not specified"
where = "{}@{}".format(user, host)
cmd = self._ssh_nohost_cmd + [str(ssh_port)] + [where] + cmd
log.info("Command: {}".format(cmd))
ret = subprocess.run(cmd, stdout=stdout, stderr=stderr, **kwargs)
err = ret.stderr.decode("utf-8")
if err != "":
log.error("Command: {} - ERROR: {}".format(cmd, err))
if stdout == subprocess.PIPE:
return (ret.returncode, ret.stdout.decode("utf-8"))
return ret.returncode
def _mkdir_machine(self, machine, dir_name):
log.info("mkdir {} on {}".format(dir_name, machine.host))
ret = self._run_cmd(["mkdir", "-p", dir_name],
user=self._config.remote_user,
host=machine.host, ssh_port=machine.ssh_port)
log.info(ret)
def _mkdir(self, dir_name="tmp"):
log.info("mkdir {}".format(dir_name))
fs = self.executor.map(self._mkdir_machine,
self._config.workload_machines,
itertools.repeat(dir_name))
for _ in fs: pass # noqa
def _listdir(self):
# for testing purposes only
log.info("listdir")
machine = self._config.workload_machines[0]
ret = self._run_cmd(["ls", "-la"],
user=self._config.remote_user, host=machine.host,
stdout=subprocess.PIPE, ssh_port=machine.ssh_port)
log.info("\n{}".format(ret[1]))
def _memgraph_symlink(self, memgraph_version, machine):
log.info("memgraph-symlink on {}".format(machine.host))
# create or update a symlink if already exists
ret = self._run_cmd(["ln", "-sf",
os.path.join(self._config.memgraph_remote_dir,
memgraph_version),
os.path.join(self._config.memgraph_remote_dir,
"memgraph")],
user=self._config.remote_user,
host=machine.host, ssh_port=machine.ssh_port)
log.info(ret)
def _memgraph_copy_machine(self, machine, memgraph_binary):
log.info("memgraph-copy on {}".format(machine.host))
args = ["scp"] + self._scp_args + ["-P", str(machine.ssh_port)]
args += [memgraph_binary,
self._config.remote_user + "@" + machine.host + ":" +
self._config.memgraph_remote_dir]
ret = self._run_cmd(args)
log.info(ret)
self._memgraph_symlink(os.path.basename(memgraph_binary), machine)
def memgraph_copy(self):
log.info("memgraph-copy")
self._mkdir(self._config.memgraph_remote_dir)
self._mkdir(self._config.durability_remote_dir)
self._mkdir(self._config.logs_remote_dir)
memgraph_binary = os.path.realpath(
self._config.memgraph_build_dir + "/memgraph")
fs = self.executor.map(self._memgraph_copy_machine,
self._config.workload_machines,
itertools.repeat(memgraph_binary))
for _ in fs: pass # noqa
def _memgraph_clean_machine(self, machine):
log.info("memgraph-clean on {}".format(machine.host))
ret = self._run_cmd(["rm", "-rf",
self._config.memgraph_remote_dir],
user=self._config.remote_user,
host=machine.host, ssh_port=machine.ssh_port)
log.info(ret)
def memgraph_clean(self):
log.info("memgraph-clean")
fs = self.executor.map(self._memgraph_clean_machine,
self._config.workload_machines)
for _ in fs: pass # noqa
def _durability_copy_machine(self, machine, i):
log.info("durability-copy on {}".format(machine.host))
# copy durability dir
args = ["scp", "-r"] + self._scp_args + ["-P",
str(machine.ssh_port)]
args += [os.path.join(self._config.durability_dir,
"worker_{}/snapshots".format(i)),
self._config.remote_user + "@" + machine.host + ":" +
self._config.durability_remote_dir]
ret = self._run_cmd(args)
log.info(ret)
def durability_copy(self):
log.info("durability-copy")
self._mkdir(self._config.durability_remote_dir)
fs = self.executor.map(self._durability_copy_machine,
self._config.workload_machines,
itertools.count(0))
for _ in fs: pass # noqa
def _durability_clean_machine(self, machine):
log.info("durability-clean on {}".format(machine.host))
ret = self._run_cmd(["rm", "-rf",
self._config.durability_remote_dir],
user=self._config.remote_user,
host=machine.host, ssh_port=machine.ssh_port)
log.info(ret)
def durability_clean(self):
log.info("durability-clean")
fs = self.executor.map(self._durability_clean_machine,
self._config.workload_machines)
for _ in fs: pass # noqa
def _is_process_running(self, process_name, machine):
ret = self._run_cmd(["pgrep", process_name],
user=self._config.remote_user,
host=machine.host, ssh_port=machine.ssh_port)
log.info(ret)
return ret == 0
def memgraph_single_node(self):
log.info("memgraph-single-node")
machine = self._config.master()
assert machine is not None, "Unable to fetch memgraph config"
is_running = self._is_process_running("memgraph", machine)
assert not is_running, "Memgraph already running on machine: "\
" {}".format(machine)
dir_cmd = ["cd", self._config.memgraph_remote_dir]
memgraph_cmd = ["nohup", "./memgraph",
"--durability-directory={}".format(
self._config.durability_remote_dir),
"--db-recover-on-startup=true",
"--num-workers", str(machine.num_workers),
"--rpc-num-workers", str(machine.rpc_num_workers),
"--log-file",
os.path.join(self._config.logs_remote_dir,
"log_worker_0"),
"--query-vertex-count-to-expand-existing", "-1"]
cmd = ["eval", "\""] + dir_cmd + ["&&"] + memgraph_cmd + \
["\"", "&> /dev/null < /dev/null &"]
ret = self._run_cmd(cmd,
user=self._config.remote_user, host=machine.host,
ssh_port=machine.ssh_port)
log.info(ret)
is_running = self._is_process_running("memgraph", machine)
assert is_running, "Memgraph failed to start on machine: "\
" {}".format(machine)
def memgraph_master(self):
log.info("memgraph-master")
machine = self._config.master()
assert machine is not None, "Unable to fetch master config"
is_running = self._is_process_running("memgraph", machine)
assert not is_running, "Memgraph already running on machine: "\
" {}".format(machine)
dir_cmd = ["cd", self._config.memgraph_remote_dir]
memgraph_cmd = ["nohup", "./memgraph",
"--master",
"--master-host", machine.address,
"--master-port", str(machine.port),
"--durability-directory={}".format(
self._config.durability_remote_dir),
"--db-recover-on-startup=true",
"--num-workers", str(machine.num_workers),
"--rpc-num-workers", str(machine.rpc_num_workers),
"--statsd-address", str(self._config.statsd_address),
"--statsd-port", str(self._config.statsd_port),
"--log-file",
os.path.join(self._config.logs_remote_dir,
"log_worker_0"),
"--query-vertex-count-to-expand-existing", "-1"]
cmd = ["eval", "\""] + dir_cmd + ["&&"] + memgraph_cmd + \
["\"", "&> /dev/null < /dev/null &"]
ret = self._run_cmd(cmd,
user=self._config.remote_user, host=machine.host,
ssh_port=machine.ssh_port)
log.info(ret)
is_running = self._is_process_running("memgraph", machine)
assert is_running, "Memgraph failed to start on machine: "\
" {}".format(machine)
def _memgraph_worker(self, machine, master, i):
is_running = self._is_process_running("memgraph", machine)
assert not is_running, "Memgraph already running on machine: "\
" {}".format(machine)
dir_cmd = ["cd", self._config.memgraph_remote_dir]
worker_cmd = ["nohup", "./memgraph",
"--worker",
"--master-host", master.address,
"--master-port", str(master.port),
"--worker-id", str(i),
"--worker-port", str(machine.port),
"--worker-host", machine.address,
"--durability-directory={}".format(
self._config.durability_remote_dir),
"--db-recover-on-startup=true",
"--num-workers", str(machine.num_workers),
"--rpc-num-workers", str(machine.rpc_num_workers),
"--statsd-address", str(self._config.statsd_address),
"--statsd-port", str(self._config.statsd_port),
"--log-file",
os.path.join(self._config.logs_remote_dir,
"log_worker_{}".format(i))]
cmd = ["eval", "\""] + dir_cmd + ["&&"] + worker_cmd + \
["\"", "&> /dev/null < /dev/null &"]
ret = self._run_cmd(cmd,
user=self._config.remote_user,
host=machine.host, ssh_port=machine.ssh_port)
log.info(ret)
is_running = self._is_process_running("memgraph", machine)
assert is_running, "Memgraph failed to start on machine: "\
" {}".format(machine)
def memgraph_workers(self):
log.info("memgraph-workers")
master = self._config.master()
assert master is not None, "Unable to fetch master config"
workers = self._config.workers()
assert workers, "Unable to fetch workers config"
fs = self.executor.map(self._memgraph_worker,
self._config.workload_machines[1:],
itertools.repeat(master),
itertools.count(1))
for _ in fs: pass # noqa
def _kill_process(self, machine, process, signal):
ret = self._run_cmd(["sudo", "kill", "-" + str(signal),
"$(pgrep {})".format(process)],
user=self._config.remote_user,
host=machine.host, ssh_port=machine.ssh_port)
log.info(ret)
def memgraph_terminate(self):
log.info("memgraph-terminate")
# only kill master - master stops workers
machine = self._config.master()
self._kill_process(machine, "memgraph", 15)
def memgraph_kill(self):
log.info("memgraph-kill")
fs = self.executor.map(self._kill_process,
self._config.workload_machines,
itertools.repeat("memgraph"),
itertools.repeat(9))
for _ in fs: pass # noqa
def _collect_log(self, machine, i):
log.info("collect-log from {}".format(machine.host))
args = ["scp"] + self._scp_args + ["-P", str(machine.ssh_port)]
args += [self._config.remote_user + "@" + machine.host + ":" +
os.path.join(self._config.logs_remote_dir,
"log_worker_{}".format(i)),
self._config.logs_dir]
ret = self._run_cmd(args)
log.info(ret)
def collect_logs(self):
log.info("collect-logs")
fs = self.executor.map(self._collect_log,
self._config.workload_machines,
itertools.count(0))
for _ in fs: pass # noqa
def _tcpdump_machine(self, machine, i):
cmd = ["sudo", "tcpdump", "-i", "eth0", "-w", os.path.join(
self._config.tcpdump_remote_dir,
"dump_worker_{}".format(i))]
cmd = ["eval", "\""] + cmd + ["\"", "&> /dev/null < /dev/null &"]
ret = self._run_cmd(cmd,
user=self._config.remote_user,
host=machine.host, ssh_port=machine.ssh_port)
log.info(ret)
def tcpdump_start(self):
log.info("tcpdump-start")
self._mkdir(self._config.tcpdump_remote_dir)
fs = self.executor.map(self._tcpdump_machine,
self._config.workload_machines,
itertools.count(0))
for _ in fs: pass # noqa
def _tcpdump_collect_machine(self, machine, i):
log.info("collect-tcpdump from {}".format(machine.host))
# first kill tcpdump process
ret = self._run_cmd(["nohup", "sudo", "kill", "-15",
"$(pgrep tcpdump)"],
user=self._config.remote_user,
host=machine.host, ssh_port=machine.ssh_port)
log.info(ret)
args = ["scp"] + self._scp_args + ["-P", str(machine.ssh_port)]
args += [self._config.remote_user + "@" + machine.host + ":" +
os.path.join(self._config.tcpdump_remote_dir,
"dump_worker_{}".format(i)),
self._config.tcpdump_dir]
ret = self._run_cmd(args)
log.info(ret)
def tcpdump_collect(self):
log.info("tcpdump-collect")
fs = self.executor.map(self._tcpdump_collect_machine,
self._config.workload_machines,
itertools.count(0))
for _ in fs: pass # noqa
def tcpdump_clean(self):
log.info("tcpdump-clean")
for machine in self._config.workload_machines:
log.info("tcpdump-clean on {}".format(machine.host))
ret = self._run_cmd(["rm", "-rf",
self._config.tcpdump_remote_dir],
user=self._config.remote_user,
host=machine.host, ssh_port=machine.ssh_port)
log.info(ret)
def main():
args = parse_args()
config = Config(args.config)
action = args.action.replace("-", "_")
runner = RemoteRunner(config)
action = getattr(runner, action)
action()
if __name__ == "__main__":
main()