Integrate distributed card fraud test with Apollo
Reviewers: buda
Reviewed By: buda
Subscribers: pullbot
Differential Revision: https://phabricator.memgraph.io/D1247
Parent: 19abd0ce02
Commit: 7e945c6667

@@ -24,7 +24,7 @@
 
 cd build_release
 cmake -DCMAKE_BUILD_TYPE=release ..
-TIMEOUT=1000 make -j$THREADS memgraph memgraph__macro_benchmark memgraph__stress
+TIMEOUT=1000 make -j$THREADS memgraph memgraph__macro_benchmark memgraph__stress memgraph__manual__card_fraud_generate_snapshot
 
 cd ../../parent
 
@@ -40,6 +40,10 @@
 cd ../../memgraph/tools
 TIMEOUT=300 ./setup
 
+# Generate distributed card fraud dataset
+cd ../tests/distributed/card_fraud
+./generate_dataset.sh
+
 
 - name: Debug build
   project: ^mg-master-debug$

tests/distributed/card_fraud/.gitignore (new file)

output
snapshots

tests/distributed/card_fraud/apollo_runs.py (new executable file)

#!/usr/bin/env python3
import json
import os
import re
import subprocess

from card_fraud import NUM_MACHINES, BINARIES

# paths
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
WORKSPACE_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "..", "..", "..", ".."))
OUTPUT_DIR_REL = os.path.join(os.path.relpath(SCRIPT_DIR, WORKSPACE_DIR), "output")

# generate runs
runs = []

binaries = list(map(lambda x: os.path.join("..", "..", "build_release", x), BINARIES))

for i in range(NUM_MACHINES):
    name = "master" if i == 0 else "worker" + str(i)
    additional = ["master.py"] if i == 0 else []
    outfile_paths = ["\\./" + OUTPUT_DIR_REL + "/.+"] if i == 0 else []
    if i == 0:
        cmd = "master.py --machines-num {0} --test-suite card_fraud " \
              "--test card_fraud".format(NUM_MACHINES)
    else:
        cmd = "jail_service.py"
    runs.append({
        "name": "distributed__card_fraud__" + name,
        "cd": "..",
        "supervisor": cmd,
        "infiles": binaries + [
            "common.py",
            "jail_service.py",
            "card_fraud/card_fraud.py",
            "card_fraud/snapshots/worker_" + str(i),
            "../../libs/postgresql/lib",
        ] + additional,
        "outfile_paths": outfile_paths,
        "parallel_run": "distributed__card_fraud",
        "slave_group": "remote_4c32g",
        "enable_network": True,
    })

print(json.dumps(runs, indent=4, sort_keys=True))
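
For reference, with NUM_MACHINES = 3 this script prints three run descriptions to stdout. The master's entry would look roughly like the sketch below (a Python literal derived from the loop above, not captured Apollo output; the exact relative paths depend on the workspace layout):

# Approximate shape of the first (master) run description; illustrative only.
run_master = {
    "cd": "..",
    "enable_network": True,
    "infiles": [
        "../../build_release/memgraph",
        "../../build_release/tests/macro_benchmark/card_fraud_client",
        "common.py",
        "jail_service.py",
        "card_fraud/card_fraud.py",
        "card_fraud/snapshots/worker_0",
        "../../libs/postgresql/lib",
        "master.py",
    ],
    "name": "distributed__card_fraud__master",
    "outfile_paths": ["\\./memgraph/tests/distributed/card_fraud/output/.+"],
    "parallel_run": "distributed__card_fraud",
    "slave_group": "remote_4c32g",
    "supervisor": "master.py --machines-num 3 --test-suite card_fraud --test card_fraud",
}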

tests/distributed/card_fraud/card_fraud.py (new file)

import json
import os
import time

# to change the size of the cluster, just change this parameter
NUM_MACHINES = 3

# test setup
SCENARIOS = ["point_lookup", "create_tx_without_edge"]
DURATION = 300
WORKERS = 6

# constants
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
MEMGRAPH_BINARY = "memgraph"
CLIENT_BINARY = "tests/macro_benchmark/card_fraud_client"
BINARIES = [MEMGRAPH_BINARY, CLIENT_BINARY]


# wrappers
class WorkerWrapper:
    def __init__(self, address, worker):
        self._address = address
        self._worker = worker
        self._tid = worker.get_jail()

    def get_address(self):
        return self._address

    def __getattr__(self, name):
        if name in ["allocate_file", "read_file", "store_label"]:
            return getattr(self._worker, name)
        def func(*args, **kwargs):
            args = [self._tid] + list(args)
            return getattr(self._worker, name)(*args, **kwargs)
        return func


class MgCluster:
    def __init__(self, machine_ids, workers):
        # create wrappers
        self._master = WorkerWrapper(os.environ[machine_ids[0]],
                                     workers[machine_ids[0]])
        self._workers = []
        for machine_id in machine_ids[1:]:
            self._workers.append(WorkerWrapper(os.environ[machine_id],
                                               workers[machine_id]))

    def start(self):
        # start memgraph master
        self._master.start(MEMGRAPH_BINARY, [
            "--master",
            "--master-host", self._master.get_address(),
            "--master-port", "10000",
            "--durability-directory", os.path.join(SCRIPT_DIR, "snapshots",
                                                   "worker_0"),
            "--db-recover-on-startup",
            "--query-vertex-count-to-expand-existing", "-1",
            "--num-workers", str(WORKERS),
            "--rpc-num-workers", str(WORKERS),
        ])

        # sleep to allow the master to startup
        time.sleep(5)

        # start memgraph workers
        for i, worker in enumerate(self._workers, start=1):
            worker.start(MEMGRAPH_BINARY, [
                "--worker", "--worker-id", str(i),
                "--worker-host", worker.get_address(),
                "--worker-port", str(10000 + i),
                "--master-host", self._master.get_address(),
                "--master-port", "10000",
                "--durability-directory", os.path.join(SCRIPT_DIR, "snapshots",
                                                       "worker_" + str(i)),
                "--db-recover-on-startup",
                "--num-workers", str(WORKERS),
                "--rpc-num-workers", str(WORKERS),
            ])

        # sleep to allow the workers to startup
        time.sleep(5)

        # store initial usage
        self._usage_start = [self._master.get_usage()]
        for worker in self._workers:
            self._usage_start.append(worker.get_usage())
        self._usage_start_time = time.time()

    def get_master_address(self):
        return self._master.get_address()

    def check_status(self):
        if not self._master.check_status():
            return False
        for worker in self._workers:
            if not worker.check_status():
                return False
        return True

    def stop(self):
        # store final usage
        self._usage_stop = [self._master.get_usage()]
        for worker in self._workers:
            self._usage_stop.append(worker.get_usage())
        self._usage_stop_time = time.time()

        # stop the master
        self._master.stop()

        # wait to allow the master and workers to die
        time.sleep(5)

        # stop the workers
        for worker in self._workers:
            worker.stop()

        # wait to allow the workers to die
        time.sleep(5)

    def get_usage(self):
        ret = []
        tdelta = self._usage_stop_time - self._usage_start_time
        for val_start, val_stop in zip(self._usage_start, self._usage_stop):
            data = {
                "cpu": (val_stop["cpu"] - val_start["cpu"]) / tdelta,
                "memory": val_stop["max_memory"] / 1024,
                "threads": val_stop["max_threads"],
                "network": {}
            }
            net_start = val_start["network"]["eth0"]
            net_stop = val_stop["network"]["eth0"]
            for i in ["bytes", "packets"]:
                data["network"][i] = {}
                for j in ["rx", "tx"]:
                    data["network"][i][j] = (net_stop[i][j] -
                                             net_start[i][j]) / tdelta
            ret.append(data)
        return ret

    def store_label(self, label):
        self._master.store_label(label)
        for worker in self._workers:
            worker.store_label(label)


def write_scenario_summary(scenario, throughput, usage, output):
    output.write("Scenario **{}** throughput !!{:.2f}!! queries/s.\n\n".format(
        scenario, throughput))
    headers = ["Memgraph", "CPU", "Max memory", "Max threads",
               "Network RX", "Network TX"]
    output.write("<table>\n<tr>")
    for header in headers:
        output.write("<th>{}</th>".format(header))
    output.write("</tr>\n")
    for i, current in enumerate(usage):
        name = "master" if i == 0 else "worker" + str(i)
        output.write("<tr><td>{}</td>".format(name))
        for key, unit in [("cpu", "s/s"), ("memory", "MiB"), ("threads", "")]:
            fmt = ".2f" if key != "threads" else ""
            output.write(("<td>{:" + fmt + "} {}</td>").format(
                current[key], unit).strip())
        for key in ["rx", "tx"]:
            output.write("<td>{:.2f} packets/s</td>".format(
                current["network"]["packets"][key]))
        output.write("</tr>\n")
    output.write("</table>\n\n")


# main test function
def run(machine_ids, workers):
    # create output directory
    output_dir = os.path.join(SCRIPT_DIR, "output")
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # create memgraph cluster and client
    mg_cluster = MgCluster(machine_ids, workers)
    mg_client = WorkerWrapper(os.environ[machine_ids[0]],
                              workers[machine_ids[0]])

    # execute the tests
    stats = {}
    for scenario in SCENARIOS:
        output_file = os.path.join(output_dir, scenario + ".json")

        print("Starting memgraph cluster")
        mg_cluster.store_label("Start: cluster")
        mg_cluster.start()

        print("Starting client scenario:", scenario)
        mg_cluster.store_label("Start: " + scenario)
        mg_client.start(CLIENT_BINARY, [
            "--address", mg_cluster.get_master_address(),
            "--group", "card_fraud",
            "--scenario", scenario,
            "--duration", str(DURATION),
            "--num-workers", str(WORKERS),
            "--output", output_file,
        ])

        # wait for the client to terminate and check the cluster status
        while mg_client.check_status():
            assert mg_cluster.check_status(), "The memgraph cluster has died!"
            time.sleep(2)

        # stop everything
        mg_client.wait()
        mg_cluster.store_label("Stop: " + scenario)
        mg_cluster.stop()
        mg_cluster.store_label("Stop: cluster")

        # process the stats
        data = json.loads(list(filter(lambda x: x.strip(),
                open(output_file).read().split("\n")))[-1])
        throughput = data["num_executed_queries"] / data["elapsed_time"]
        usage = mg_cluster.get_usage()
        stats[scenario] = (throughput, usage)

    # dump the stats
    stats_file = open(os.path.join(output_dir, ".card_fraud_summary"), "w")
    stats_file.write("==== Distributed card fraud summary: ====\n\n")
    for scenario in SCENARIOS:
        throughput, usage = stats[scenario]
        write_scenario_summary(scenario, throughput, usage, stats_file)
    stats_file.close()
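
As a reading aid, MgCluster.get_usage() above yields one record per machine. Assuming the counters exposed by the jail API, each record has roughly this shape (the numbers below are illustrative, not measured):

# Sketch of one per-machine record returned by MgCluster.get_usage().
usage_record = {
    "cpu": 1.37,                # CPU seconds consumed per wall-clock second
    "memory": 512.0,            # peak resident memory, MiB
    "threads": 24,              # peak thread count
    "network": {
        "bytes":   {"rx": 1.2e6, "tx": 9.8e5},   # bytes/s on eth0
        "packets": {"rx": 850.0, "tx": 790.0},   # packets/s on eth0
    },
}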

tests/distributed/card_fraud/config.json (new file)

{
    "indexes" : ["Card.id", "Pos.id", "Transaction.fraud_reported"],
    "nodes" : [
        {
            "count_per_worker" : 10000,
            "label" : "Card"
        },
        {
            "count_per_worker" : 1000,
            "label" : "Pos"
        },
        {
            "count_per_worker" : 50000,
            "label" : "Transaction"
        }
    ],
    "compromised_pos_probability" : 0.2,
    "fraud_reported_probability" : 0.1,
    "hop_probability" : 0.1
}
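
To put the config in perspective: the counts are per worker, so with the NUM_MACHINES = 3 cluster defined in card_fraud.py the generator produces 3 × (10000 + 1000 + 50000) = 183000 nodes in total. A quick check:

# Back-of-the-envelope dataset size implied by config.json (per-worker
# counts multiplied by the number of machines from card_fraud.py).
counts_per_worker = {"Card": 10000, "Pos": 1000, "Transaction": 50000}
num_machines = 3  # NUM_MACHINES in card_fraud.py
total_nodes = num_machines * sum(counts_per_worker.values())
assert total_nodes == 183000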

tests/distributed/card_fraud/generate_dataset.sh (new executable file)

#!/bin/bash

script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cd $script_dir

output_dir=snapshots

if [ -d $output_dir ]; then
    rm -rf $output_dir
fi

# NOTE: "tail -c 2" keeps just the last digit plus the trailing newline, so
# this extraction only works while NUM_MACHINES is a single-digit number.
NUM_MACHINES="$( cat card_fraud.py | grep -m1 "NUM_MACHINES" | tail -c 2 )"

../../../build_release/tests/manual/card_fraud_generate_snapshot --config config.json --num-workers $NUM_MACHINES --dir $output_dir

jail_service.py (modified)

@@ -38,6 +38,12 @@ class JailService:
         self._generated_filenames = []
         self.tempdir = tempfile.TemporaryDirectory()
 
+    def _get_proc(self, tid):
+        if tid not in self.processes:
+            raise Exception(
+                "Binary with tid {tid} does not exist".format(tid=tid))
+        return self.processes[tid]
+
     def start(self, tid, binary_name, binary_args=None):
         self.log.info("Starting Binary: {binary}".format(binary=binary_name))
         self.log.info("With args: {args}".format(args=binary_args))
@@ -50,7 +56,7 @@ class JailService:
         binary = get_absolute_path(binary_name, "build_release")
 
         # fetch process
-        proc = self.processes[tid]
+        proc = self._get_proc(tid)
 
         # start binary
         proc.run(binary, args=binary_args, timeout=600)
@@ -59,13 +65,29 @@ class JailService:
             binary=binary_name, tid=proc._tid)
         self.log.info(msg)
 
+    def check_status(self, tid):
+        proc = self._get_proc(tid)
+        status = proc.get_status()
+        if status is None: return True
+        assert status == 0, "The binary exited with a non-zero status!"
+        return False
+
+    def get_usage(self, tid):
+        usage = self._get_proc(tid).get_usage()
+        usage.update({"network": jail.get_network_usage()})
+        return usage
+
+    def wait(self, tid):
+        proc = self._get_proc(tid)
+        proc.wait()
+
     def stop(self, tid):
         self.log.info("Stopping binary with tid {tid}".format(tid=tid))
-        if tid not in self.processes:
-            raise Exception(
-                "Binary with tid {tid} does not exist".format(tid=tid))
-        proc = self.processes[tid]
-        proc.send_signal(jail.SIGTERM)
+        proc = self._get_proc(tid)
+        try:
+            proc.send_signal(jail.SIGTERM)
+        except Exception:
+            pass
         proc.wait()
         self.log.info("Binary with tid {tid} stopped".format(tid=tid))
@@ -88,6 +110,13 @@ class JailService:
         self.processes[proc._tid] = proc
         return proc._tid
 
+    def store_label(self, label):
+        jail.store_label(label)
+
+    def shutdown(self):
+        self.log.info("Stopping Jail Service")
+        os._exit(0)
+
 
 def main():
     # set port dynamically
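
The new check_status() is deliberately tri-state: it returns True while the binary is still running, False once it exits with status zero, and trips an assertion on any non-zero exit. A minimal polling sketch (hypothetical helper names, mirroring the loop in card_fraud.py above):

import time

# Hypothetical driver-side helper: poll a client process via the new
# JailService.check_status() while making sure the rest of the cluster
# stays alive. A non-zero exit raises inside check_status() itself.
def wait_for_client(service, client_tid, cluster_tids, poll=2):
    while service.check_status(client_tid):       # client still running
        for tid in cluster_tids:                  # cluster must stay up
            assert service.check_status(tid), "cluster process died"
        time.sleep(poll)
    service.wait(client_tid)                      # reap the finished client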

@@ -38,6 +38,7 @@ done
 quit()
 {
     # Stop workers
+    sleep 1
     for i in `seq 2 $NUM_MACHINES`;
     do
         kill ${pids[$i]}

master.py (modified)

@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import atexit
 import importlib
 import logging
 import os
@@ -35,6 +36,7 @@ def main(args):
     workers = {}
     machine_ids = []
     machines_num = int(args.machines_num)
+
     # initialize workers
     for i in range(machines_num):
         id = i + 1
@@ -54,6 +56,15 @@ def main(args):
 
         workers[machine_id] = worker
 
+    # cleanup at exit
+    @atexit.register
+    def cleanup():
+        for machine_id in machine_ids[1:]:
+            try:
+                workers[machine_id].shutdown()
+            except ConnectionRefusedError:
+                pass
+
     # run test
     test = importlib.import_module(
         "{suite}.{test}".format(suite=args.test_suite, test=args.test))

@@ -1,6 +1,7 @@
 #!/usr/bin/python3
 
 import atexit
+import copy
 import json
 import os
 import resource
@@ -13,10 +13,6 @@ import uuid
 from signal import *
 
 
-SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
-STORAGE_DIR = os.path.join(SCRIPT_DIR, ".storage")
-
-
 class ProcessException(Exception):
     pass
 
@@ -63,14 +59,14 @@ class Process:
     def run_and_wait(self, *args, **kwargs):
         check = kwargs.pop("check", True)
         self.run(*args, **kwargs)
-        return self.wait()
+        return self.wait(check)
 
     def wait(self, check = True):
         if self._proc == None:
             raise ProcessException
         self._proc.wait()
         if check and self._proc.returncode != 0:
-            raise ProcessException
+            raise ProcessException("Command returned non-zero!")
         return self._proc.returncode
 
     def get_status(self):
@@ -87,7 +83,7 @@ class Process:
     def get_usage(self):
         if self._proc == None:
             raise ProcessException
-        return self._usage
+        return copy.deepcopy(self._usage)
 
     # this is implemented only in the real API
     def set_cpus(self, cpus, hyper=True):
@@ -136,9 +132,11 @@ class Process:
         except:
             return
         # for a description of these fields see: man proc; man times
-        cpu_time = sum(map(lambda x: int(x) / self._ticks_per_sec,
-                data_stat[13:17]))
-        self._set_usage(cpu_time, "cpu", only_value = True)
+        utime, stime, cutime, cstime = map(
+            lambda x: int(x) / self._ticks_per_sec, data_stat[13:17])
+        self._set_usage(utime + stime + cutime + cstime, "cpu", only_value=True)
+        self._set_usage(utime + cutime, "cpu_user", only_value=True)
+        self._set_usage(stime + cstime, "cpu_sys", only_value=True)
         self._set_usage(int(data_stat[19]), "threads")
         mem_vm, mem_res, mem_shr = map(
             lambda x: int(x) * self._page_size // 1024, data_statm[:3])
@@ -161,7 +159,6 @@
 PROCESSES_NUM = 8
 _processes = [Process(i) for i in range(1, PROCESSES_NUM + 1)]
 _last_process = 0
-_thread_run = True
 
 def _usage_updater():
     while True:
@@ -172,20 +169,12 @@ def _usage_updater():
 _thread = threading.Thread(target=_usage_updater, daemon=True)
 _thread.start()
 
-if not os.path.exists(STORAGE_DIR):
-    os.mkdir(STORAGE_DIR)
-
-_storage_name = os.path.join(
-    STORAGE_DIR, time.strftime("%Y%m%d%H%M%S") + ".json")
-_storage_file = open(_storage_name, "w")
-
 @atexit.register
 def cleanup():
     for proc in _processes:
         if proc._proc == None: continue
         proc.send_signal(SIGKILL)
         proc.get_status()
-    _storage_file.close()
 
 # end of private methods ------------------------------------------------------
 
@@ -230,11 +219,25 @@ def get_host_info():
     return {"cpus": cpus, "memory": memory, "hyperthreading": hyper,
             "threads": threads}
 
+# placeholder function that stores a label in the real API
+def store_label(label):
+    if type(label) != str:
+        raise Exception("Label must be a string!")
+
+# this function is deprecated
 def store_data(data):
-    if not type(data) == dict:
-        raise StorageException("Data must be a dictionary!")
-    for i in ["unit", "type", "value"]:
-        if not i in data:
-            raise StorageException("Field '{}' missing in data!".format(i))
-    data["timestamp"] = time.time()
-    _storage_file.write(json.dumps(data) + "\n")
+    pass
+
+# placeholder function that returns real data in the real API
+def get_network_usage():
+    usage = {
+        "lo": {
+            "bytes": {"rx": 0, "tx": 0},
+            "packets": {"rx": 0, "tx": 0}
+        },
+        "eth0": {
+            "bytes": {"rx": 0, "tx": 0},
+            "packets": {"rx": 0, "tx": 0}
+        }
+    }
+    return usage