Integrated C++ CSV to snapshot into LDBC.

Reviewers: mislav.bradac Reviewed By: mislav.bradac Subscribers: pullbot Differential Revision: https://phabricator.memgraph.io/D826
2017-09-23 22:43:58 +02:00 · 2017-09-23 22:43:58 +02:00 · 99494d77e3
commit 99494d77e3
parent d9503d6b65
9 changed files with 55 additions and 42 deletions
--- a/tests/public_benchmark/ldbc/README.md
+++ b/tests/public_benchmark/ldbc/README.md
@ -9,8 +9,7 @@
    ./run_benchmark --create-index --run-db memgraph # or neo4j
    # To run update queries pass the properties file for updates and slow down
    # the execution by setting a larger time compression ratio.
-    ./run_benchmark --create-index --run-db memgraph \
-                    --properties-file ldbc-snb-impls-updates.properties \
+    ./run_benchmark --create-index --run-db memgraph --test-type updates \
                    --time-compression-ratio 1.5

 ## How to run a specific test?
--- a/tests/public_benchmark/ldbc/continuous_integration
+++ b/tests/public_benchmark/ldbc/continuous_integration
@ -12,6 +12,6 @@ TIMEOUT=3600 ./run_benchmark --run-db neo4j --create-index --thread-count $THREA
 ./ve3/bin/python3 ../../../tools/plot_ldbc_latency --results results/read-memgraph-scale_1-LDBC-results.json results/read-neo4j-scale_1-LDBC-results.json --logo-path plots/ldbc-logo.png --plot-title "Read queries, scale 1" --output plots/read-queries-scale_1.png

 # run update benchmarks
-TIMEOUT=3600 ./run_benchmark --run-db memgraph --create-index --thread-count $THREADS --result-file-prefix update --time-compression-ratio 1.5 --properties-file ldbc-snb-impls-updates.properties
-TIMEOUT=3600 ./run_benchmark --run-db neo4j --create-index --thread-count $THREADS --result-file-prefix update --time-compression-ratio 1.5 --properties-file ldbc-snb-impls-updates.properties
+TIMEOUT=3600 ./run_benchmark --run-db memgraph --create-index --thread-count $THREADS --result-file-prefix update --test-type updates --time-compression-ratio 1.5
+TIMEOUT=3600 ./run_benchmark --run-db neo4j --create-index --thread-count $THREADS --result-file-prefix update --test-type updates --time-compression-ratio 1.5
 ./ve3/bin/python3 ../../../tools/plot_ldbc_latency --results results/update-memgraph-scale_1-LDBC-results.json results/update-neo4j-scale_1-LDBC-results.json --logo-path plots/ldbc-logo.png --plot-title "Update queries, scale 1" --output plots/update-queries-scale_1.png
--- a/tests/public_benchmark/ldbc/index_creation.py
+++ b/tests/public_benchmark/ldbc/index_creation.py
@ -29,6 +29,7 @@ session = driver.session()

 # The fist program argument is path to a file with indexes.
 with open(args.indexfile, "r") as f:
+    print("Starting index creation...")
    for line in f.readlines():
        session.run(line.strip()).consume()
        print("%s -> DONE" % line.strip())
--- a/tests/public_benchmark/ldbc/ldbc-snb-impls-short-reads.properties
+++ b/tests/public_benchmark/ldbc/ldbc-snb-impls-short-reads.properties
@ -43,17 +43,17 @@ ldbc.snb.interactive.LdbcQuery14_freq=49

 # At least one needs to be enabled in order to cause interleaving of short
 # reads. To get all the reads, you should run with a large operation count.
-ldbc.snb.interactive.LdbcQuery1_enable=false
-ldbc.snb.interactive.LdbcQuery2_enable=false
+ldbc.snb.interactive.LdbcQuery1_enable=true
+ldbc.snb.interactive.LdbcQuery2_enable=true
 ldbc.snb.interactive.LdbcQuery3_enable=false
-ldbc.snb.interactive.LdbcQuery4_enable=false
-ldbc.snb.interactive.LdbcQuery5_enable=false
+ldbc.snb.interactive.LdbcQuery4_enable=true
+ldbc.snb.interactive.LdbcQuery5_enable=true
 ldbc.snb.interactive.LdbcQuery6_enable=false
 ldbc.snb.interactive.LdbcQuery7_enable=false
 ldbc.snb.interactive.LdbcQuery8_enable=true
 ldbc.snb.interactive.LdbcQuery9_enable=false
 ldbc.snb.interactive.LdbcQuery10_enable=false
-ldbc.snb.interactive.LdbcQuery11_enable=false
+ldbc.snb.interactive.LdbcQuery11_enable=true
 ldbc.snb.interactive.LdbcQuery12_enable=false
 ldbc.snb.interactive.LdbcQuery13_enable=false
 ldbc.snb.interactive.LdbcQuery14_enable=false
--- a/tests/public_benchmark/ldbc/run_benchmark
+++ b/tests/public_benchmark/ldbc/run_benchmark
@ -44,7 +44,7 @@ class Memgraph:
        database_args = [binary, "--num-workers", self.num_workers,
                         "--snapshot-directory", os.path.join(self.dataset,
                                                              "memgraph"),
-                         "--recover-on-startup", "true",
+                         "--snapshot-recover-on-startup", "true",
                         "--port", self.port]

        # database env
@ -119,12 +119,12 @@ def parse_args():
                      help='Dataset scale to use for benchmarking.')
    argp.add_argument('--host', default='127.0.0.1', help='Database host.')
    argp.add_argument('--port', default='7687', help='Database port.')
-    argp.add_argument('--time-compression-ratio', type=float, default=0.01,
+    argp.add_argument('--time-compression-ratio', type=float, default=0.001,
                      help='Compress/stretch durations between operation start '
                           'times to increase/decrease benchmark load. '
                           'E.g. 2.0 = run benchmark 2x slower, 0.1 = run '
-                           'benchmark 10x faster. Default is 0.01.')
-    argp.add_argument('--operation-count', type=int, default=200,
+                           'benchmark 10x faster. Default is 0.001.')
+    argp.add_argument('--operation-count', type=int, default=1000,
                      help='Number of operations to generate during benchmark '
                           'execution.')
    argp.add_argument('--thread-count', type=int, default=8,
@ -136,10 +136,8 @@ def parse_args():
                      help='Time unit to use for measuring performance metrics')
    argp.add_argument('--result-file-prefix', default='',
                      help='Result file name prefix')
-    argp.add_argument('--properties-file',
-                      default=os.path.join(
-                          SCRIPT_DIR, 'ldbc-snb-impls-short-reads.properties'),
-                      help='Properties file used to select queries')
+    argp.add_argument('--test-type', choices=('reads', 'updates'),
+                      default='reads', help='Test queries of type')
    argp.add_argument('--run-db', choices=('memgraph', 'neo4j'),
                      help='Run the database before starting LDBC')
    argp.add_argument('--create-index', action='store_true', default=False,
@ -194,7 +192,8 @@ def main():
        parameters_dir = os.path.join(dataset, 'substitution_parameters')
        java_cmd = ('java', '-cp', cp, 'com.ldbc.driver.Client',
                    '-P', LDBC_DEFAULT_PROPERTIES,
-                    '-P', os.path.join(os.getcwd(), args.properties_file),
+                    '-P', os.path.join(SCRIPT_DIR, "ldbc-snb-impls-{}."
+                                       "properties".format(args.test_type)),
                    '-p', 'ldbc.snb.interactive.updates_dir', updates_dir,
                    '-p', 'host', args.host, '-p', 'port', args.port,
                    '-db', 'net.ellitron.ldbcsnbimpls.interactive.neo4j.Neo4jDb',
--- a/tests/public_benchmark/ldbc/setup
+++ b/tests/public_benchmark/ldbc/setup
@ -69,6 +69,7 @@ cd ${script_dir}
 #git clone https://phabricator.memgraph.io/source/ldbc-snb-impls.git
 git clone $deps_git_url/ldbc-snb-impls.git
 cd ldbc-snb-impls
+git checkout 4b3bc129e6991dfa9adc974bb8fc53258036e127
 sed -r '/(snb-interactive-tools|snb-interactive-titan|snb-interactive-torc)/s@^( +)(.+)$@\1<!--\2-->@' -i pom.xml
 $mvn install
 $mvn clean compile assembly:single
--- a/tools/apollo/build_release
+++ b/tools/apollo/build_release
@ -12,6 +12,10 @@ cd build
 cmake -DCMAKE_BUILD_TYPE=release ..
 TIMEOUT=1000 make -j$THREADS

-cd ../tools/apollo
+cd ../tools
+
+./setup
+
+cd apollo

 ./generate release
--- a/tools/plot_ldbc_latency
+++ b/tools/plot_ldbc_latency
@ -53,8 +53,6 @@ def parse_args():
                      default="ms", help="The time unit that should be used.")
    argp.add_argument("--output", default="",
                      help="Save plot to file (instead of displaying it).")
-    argp.add_argument("--max-label-width", default=11, type=int,
-                      help="Maximum length of the x-axis labels (-1 is unlimited)")
    return argp.parse_args()


@ -66,7 +64,7 @@ def autolabel(ax, rects):
        height = rect.get_height()
        # TODO: adjust more vendors
        ax.text(rect.get_x() + rect.get_width()/2., 1.00*height,
-                '%d' % int(height),
+                '{:.1f}'.format(height),
                ha='center', va='bottom')


@ -87,6 +85,15 @@ def main():
        vendors.append(vendor)
    assert len(vendors) == 2, "The graph is tailored for only 2 vendors."

+    # Helper for shortening the query name.
+    def shorten_query_name(name):
+        if name.lower().startswith("ldbc"):
+            name = name[4:]
+        # Long query names on the x-axis don't look compelling.
+        num = "".join(filter(lambda x: x in string.digits, name))
+        prefix = name.split(num)[0]
+        return prefix + num
+
    # Collect the benchmark data.
    print("LDBC Latency Data")
    for vendor in vendors:
@ -96,13 +103,19 @@ def main():
                mean_runtime = (query_data["run_time"]["mean"] /
                        LDBC_TIME_FACTORS[results_data["unit"]] *
                        TIME_FACTORS[args.time_unit])
-                query_name = query_data['name']
+                query_name = shorten_query_name(query_data['name'])
                vendor['results'].append((query_name, mean_runtime))

+    # Helper for sorting the results.
+    def sort_key(obj):
+        name = obj[0]
+        num = int("".join(filter(lambda x: x in string.digits, name)))
+        prefix = name.split(str(num))[0]
+        return (prefix, num)
+
    # Sort results.
    for vendor in vendors:
-        vendor['results'].sort(key=lambda item: int("".join(filter(
-                lambda x: x in string.digits, item[0]))))
+        vendor['results'].sort(key=sort_key)

    # Print results.
    for vendor in vendors:
@ -116,6 +129,15 @@ def main():
        "Queries between different vendors are different!"
    query_names = all_query_names[0]

+    # Font size.
+    plt.rc('font', size=12)          # controls default text sizes
+    plt.rc('axes', titlesize=24)     # fontsize of the axes title
+    plt.rc('axes', labelsize=16)    # fontsize of the x and y labels
+    plt.rc('xtick', labelsize=12)    # fontsize of the tick labels
+    plt.rc('ytick', labelsize=12)    # fontsize of the tick labels
+    plt.rc('legend', fontsize=16)    # legend fontsize
+    plt.rc('figure', titlesize=24)  # fontsize of the figure title
+
    # Plot.
    ind = np.arange(len(query_names))   # the x locations for the groups
    width = 0.40                        # the width of the bars
@ -124,20 +146,7 @@ def main():
    ax.set_ylabel('Mean Latency (%s)' % (args.time_unit))  # YAxis title
    ax.set_facecolor('#dcdcdc')         # plot bg color (light gray)
    ax.set_xticks(ind + width / len(vendors))  # TODO: adjust (more vendors)
-
-    def shorten_query_name(query_name):
-        # Long query names on the x-axis don't look compelling.
-        if query_name.lower().startswith('ldbc'):
-            query_name = query_name[4:]
-        if len(query_name) > args.max_label_width:
-            query_name = query_name[:args.max_label_width] + '\N{HORIZONTAL ELLIPSIS}'
-        return query_name
-    labels = query_names
-    if args.max_label_width == 0:
-        labels = ["Q{}".format(i) for i, _ in enumerate(query_names)]
-    elif args.max_label_width > 0:
-        labels = map(shorten_query_name, query_names)
-    ax.set_xticklabels(labels, rotation=30)
+    ax.set_xticklabels(query_names, rotation=30)
    # set only horizontal grid lines
    for line in ax.get_xgridlines():
        line.set_linestyle(' ')
@ -149,11 +158,11 @@ def main():
    ax.set_title(args.plot_title)
    # Draw logo or plot title
    if args.logo_path != None:
-        # TODO: improve the logo positioning
        im = plt.imread(get_sample_data(os.path.join(os.getcwd(),
                                                     args.logo_path)))
        plt.gcf().subplots_adjust(top=0.85)
-        newax = fig.add_axes([0.46, 0.85, 0.12, 0.15], anchor='N')
+        # magic numbers for logo size - DO NOT TOUCH!
+        newax = fig.add_axes([0.46, 0.85, 0.126, 0.15], anchor='N')
        newax.imshow(im)
        newax.axis('off')
    # Draw bars
--- a/tools/src/csv_to_snapshot/main.cpp
+++ b/tools/src/csv_to_snapshot/main.cpp
@ -210,7 +210,7 @@ void WriteNodeRow(const std::vector<Field> &fields,
        }
      }
      id = node_id_map.Insert(node_id);
-      properties["id"] = *id;
+      properties["id"] = node_id.id;
    } else if (field.type == "label") {
      for (const auto &label : utils::Split(value, FLAGS_array_delimiter)) {
        labels.emplace_back(utils::Trim(label));