From 880a563cf3160fe6b15edd1744d36abd0550abc1 Mon Sep 17 00:00:00 2001
From: florijan <florijan@memgraph.io>
Date: Fri, 6 Oct 2017 14:14:15 +0200
Subject: [PATCH] Generate snapshot added

Summary:
A cool new graph generator. It's really cool!

It is currently being used for generating the Ravelin graph with the following config:
```
{
  "indexes": ["Node.id"],
  "nodes": [
    {
      "count": 40000000,
      "labels": ["Node"],
      "properties": {
              "id" : {"type": "counter", "param" : "Node.id"},
              "fradulent" : {"type" : "bernoulli", "param" : 0.0005}
      }
    }
  ],
  "edges": [
    {
      "count": 80000000,
      "from": "Node",
      "to": "Node",
      "type": "Edge"
    }
  ]
}
```

Reviewers: buda, mislav.bradac

Reviewed By: mislav.bradac

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D872
---
 .../bolt/v1/encoder/base_encoder.hpp          |  12 +-
 tests/manual/generate_snapshot.cpp            | 262 ++++++++++++++++++
 2 files changed, 271 insertions(+), 3 deletions(-)
 create mode 100644 tests/manual/generate_snapshot.cpp

diff --git a/src/communication/bolt/v1/encoder/base_encoder.hpp b/src/communication/bolt/v1/encoder/base_encoder.hpp
index 081163083..e842128bd 100644
--- a/src/communication/bolt/v1/encoder/base_encoder.hpp
+++ b/src/communication/bolt/v1/encoder/base_encoder.hpp
@@ -99,7 +99,13 @@ class BaseEncoder {
     for (auto &x : value) WriteTypedValue(x);
   }
 
-  void WriteMap(const std::map<std::string, query::TypedValue> &value) {
+  /**
+   * Writes a map value.
+   *
+   * @tparam TMap - an iterable of (std::string, TypedValue) pairs.
+   */
+  template <typename TMap>
+  void WriteMap(const TMap &value) {
     WriteTypeSize(value.size(), MarkerMap);
     for (auto &x : value) {
       WriteString(x.first);
@@ -169,8 +175,8 @@ class BaseEncoder {
     auto add_element = [&indices](auto &collection, const auto &element,
                                   int multiplier, int offset) {
       auto found = std::find(collection.begin(), collection.end(), element);
-      indices.emplace_back(
-          multiplier * (std::distance(collection.begin(), found) + offset));
+      indices.emplace_back(multiplier *
+                           (std::distance(collection.begin(), found) + offset));
       if (found == collection.end()) collection.emplace_back(element);
     };
 
diff --git a/tests/manual/generate_snapshot.cpp b/tests/manual/generate_snapshot.cpp
new file mode 100644
index 000000000..c479ef11d
--- /dev/null
+++ b/tests/manual/generate_snapshot.cpp
@@ -0,0 +1,262 @@
+#include <algorithm>
+#include <fstream>
+#include <memory>
+#include <random>
+#include <unordered_map>
+#include <vector>
+
+#include <fmt/format.h>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <json/json.hpp>
+
+#include "communication/bolt/v1/encoder/base_encoder.hpp"
+#include "durability/file_writer_buffer.hpp"
+#include "utils/assert.hpp"
+#include "utils/string.hpp"
+#include "utils/timer.hpp"
+
+DEFINE_string(out, "", "Destination for the created snapshot file");
+DEFINE_string(config, "", "Path to config JSON file");
+
+/**
+ * This file contains the program for generating a snapshot based on a JSON
+ * definition. The JSON config has the following form:
+ *
+ * {
+ *   "indexes" : ["Person.id", "Company.id"],
+ *   "nodes" : [
+ *     {
+ *       "count" : 10000,
+ *       "labels" : ["Person"],
+ *       "properties" : {
+ *         "is_happy" :  { "type" : "bernoulli", "param" : 0.01 },
+ *         "id" : { "type" : "counter", "param" : "Person.id" }
+ *       }
+ *     },
+ *     {
+ *       "count" : 200,
+ *       "labels" : ["Company"]
+ *     }
+ *   ],
+ *   "edges" : [
+ *     {
+ *       "count" : 5000,
+ *       "from" : "Person",
+ *       "to" : "Company",
+ *       "type" : "WORKS_IN"
+ *     },
+ *     {
+ *       "count" : 20,
+ *       "from" : "Person",
+ *       "to" : "Company",
+ *       "type" : "LIKES"
+ *     }
+ *   ]
+ * }
+ */
+
+// Utilities for writing to the snapshot file.
+class Writer {
+ public:
+  /**
+   * Creates a writer and immediately writes the flattened list of indexes.
+   * @param path - Path to the output file.
+   * @param indexes - (label, property) indexes, each as "Label.property".
+   */
+  Writer(const std::string &path, const std::vector<std::string> &indexes)
+      : buffer_(path), encoder_(buffer_) {
+    std::vector<std::string> indexes_flat;
+    for (const auto &index : indexes) {
+      const auto parts = utils::Split(index, ".");
+      permanent_assert(parts.size() == 2, "Index must be 'Label.property'");
+      indexes_flat.insert(indexes_flat.end(), parts.begin(), parts.end());
+    }
+    encoder_.WriteList(std::vector<query::TypedValue>(indexes_flat.begin(),
+                                                      indexes_flat.end()));
+  }
+
+  // Writes a node (ID, labels, properties) and returns its assigned ID.
+  int64_t WriteNode(
+      const std::vector<std::string> &labels,
+      const std::unordered_map<std::string, query::TypedValue> &properties) {
+    encoder_.WriteRAW(underlying_cast(communication::bolt::Marker::TinyStruct) +
+                      3);
+    encoder_.WriteRAW(underlying_cast(communication::bolt::Signature::Node));
+    encoder_.WriteInt(node_counter_);
+    encoder_.WriteList(
+        std::vector<query::TypedValue>(labels.begin(), labels.end()));
+    encoder_.WriteMap(properties);
+    return node_counter_++;
+  }
+
+  // Writes an edge (ID, endpoints, type, properties) and returns its ID.
+  int64_t WriteEdge(
+      const std::string &edge_type,
+      const std::unordered_map<std::string, query::TypedValue> &properties,
+      int64_t bolt_id_from, int64_t bolt_id_to) {
+    encoder_.WriteRAW(underlying_cast(communication::bolt::Marker::TinyStruct) +
+                      5);
+    encoder_.WriteRAW(
+        underlying_cast(communication::bolt::Signature::Relationship));
+    encoder_.WriteInt(edge_counter_);
+    encoder_.WriteInt(bolt_id_from);
+    encoder_.WriteInt(bolt_id_to);
+    encoder_.WriteString(edge_type);
+    encoder_.WriteMap(properties);
+    return edge_counter_++;
+  }
+
+  // Writes the summary, passing the node and edge counts written so far.
+  void Close() { buffer_.WriteSummary(node_counter_, edge_counter_); }
+
+ private:
+  int64_t node_counter_{0};
+  int64_t edge_counter_{0};
+  FileWriterBuffer buffer_;
+  communication::bolt::BaseEncoder<FileWriterBuffer> encoder_;
+};
+
+// Helper class for tracking info about the generated graph.
+class GraphState {
+ public:
+  // Tracks that the given node has the given label.
+  void AddNode(const std::string &label, int64_t node_bolt_id) {
+    // operator[] creates the empty vector the first time a label is seen.
+    label_nodes_[label].emplace_back(node_bolt_id);
+  }
+
+  // Gets the ID of a random node that has the given label. Fails if no node
+  // with that label has been added.
+  int64_t RandomNode(const std::string &label) {
+    const auto found = label_nodes_.find(label);
+    permanent_assert(found != label_nodes_.end(), "Label not found");
+    const auto &nodes = found->second;  // never empty: AddNode always appends
+    // Integer distribution: a scaled real in [0, 1) may round to nodes.size().
+    std::uniform_int_distribution<size_t> index(0, nodes.size() - 1);
+    return nodes[index(gen_)];
+  }
+
+ private:
+  // Maps labels to node bolt_ids
+  std::unordered_map<std::string, std::vector<int64_t>> label_nodes_;
+
+  // Random generator
+  std::mt19937 gen_{std::random_device{}()};
+};
+
+// Helper class for property value generation.
+class ValueGenerator {
+ public:
+  // Generates the whole property map based on the given config. A null config
+  // yields an empty map; otherwise every entry must be a value-generator dict.
+  std::unordered_map<std::string, query::TypedValue> MakeProperties(
+      const nlohmann::json &config) {
+    std::unordered_map<std::string, query::TypedValue> props;
+    if (config.is_null()) return props;
+
+    permanent_assert(config.is_object(), "Properties config must be a dict");
+    for (auto it = config.begin(); it != config.end(); ++it) {
+      if (it.value().is_object())
+        props.emplace(it.key(), MakeValue(it.value()));
+      else
+        permanent_fail("Unsupported value type");
+    }
+    return props;
+  }
+
+  // Generates a single value based on the given config, which must be a dict
+  // with a "type" ("bernoulli" or "counter") and a matching "param".
+  query::TypedValue MakeValue(const nlohmann::json &config) {
+    permanent_assert(config.is_object(),
+                     "Random value gen config must be a dict");
+    // Const operator[] is undefined for a missing key, so check first.
+    permanent_assert(config.count("type") == 1 && config.count("param") == 1,
+                     "Random value gen config must have 'type' and 'param'");
+    const std::string type = config["type"];
+    const auto &param = config["param"];
+    if (type == "bernoulli") return Bernoulli(param);
+    if (type == "counter") return Counter(param);
+    permanent_fail("Unknown distribution");
+  }
+
+  // Returns the current value of the named counter (starting at 0) and
+  // increments it.
+  int64_t Counter(const std::string &name) {
+    // operator[] value-initializes a counter to 0 on first use.
+    return counters_[name]++;
+  }
+
+  // Returns true with probability p.
+  bool Bernoulli(double p) { return rand_(gen_) < p; }
+
+ private:
+  std::mt19937 gen_{std::random_device{}()};
+  std::uniform_real_distribution<> rand_{0.0, 1.0};
+  std::unordered_map<std::string, int64_t> counters_;
+};
+
+// Returns a copy of object[key] if the key is present, else of default_value.
+nlohmann::json GetWithDefault(const nlohmann::json &object,
+                              const std::string &key,
+                              const nlohmann::json &default_value) {
+  const auto entry = object.find(key);
+  return entry != object.end() ? *entry : default_value;
+}
+
+int main(int argc, char **argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  google::InitGoogleLogging(argv[0]);
+  // Read the config JSON
+  nlohmann::json config;
+  {
+    std::ifstream config_file(FLAGS_config);
+    permanent_assert(config_file.is_open(), "Unable to open the config file");
+    config_file >> config;
+  }
+
+  std::vector<std::string> indexes;
+  for (const auto &index : GetWithDefault(config, "indexes", {}))
+    indexes.push_back(index);
+  Writer writer(FLAGS_out, indexes);
+  GraphState state;
+  ValueGenerator value_generator;
+
+  // Create nodes
+  const auto &nodes_config = config["nodes"];
+  permanent_assert(
+      nodes_config.is_array() && nodes_config.size() > 0,
+      "Generator config must have 'nodes' array with at least one element");
+  for (const auto &node_config : nodes_config) {
+    permanent_assert(node_config.is_object(), "Node config must be a dict");
+    // node_config is const, so a missing key must not reach operator[].
+    const auto labels_config = GetWithDefault(node_config, "labels", nullptr);
+    permanent_assert(labels_config.is_array(),
+                     "Must provide an array of node labels");
+    permanent_assert(labels_config.size() > 0,
+                     "Node labels array must contain at least one element");
+    const std::vector<std::string> labels = labels_config;
+    const auto properties = GetWithDefault(node_config, "properties", nullptr);
+    const int64_t count = node_config.at("count");
+    for (int64_t i = 0; i < count; ++i) {
+      const auto node_bolt_id =
+          writer.WriteNode(labels, value_generator.MakeProperties(properties));
+      for (const auto &label : labels) state.AddNode(label, node_bolt_id);
+    }
+  }
+
+  // Create edges
+  for (const auto &edge_config : config["edges"]) {
+    permanent_assert(edge_config.is_object(), "Edge config must be a dict");
+    const std::string from = edge_config.at("from");
+    const std::string to = edge_config.at("to");
+    const std::string type = edge_config.at("type");
+    const auto properties = GetWithDefault(edge_config, "properties", nullptr);
+    const int64_t count = edge_config.at("count");
+    for (int64_t i = 0; i < count; ++i)
+      writer.WriteEdge(type, value_generator.MakeProperties(properties),
+                       state.RandomNode(from), state.RandomNode(to));
+  }
+  writer.Close();
+  return 0;
+}