memgraph/tests/manual/generate_snapshot.cpp
florijan ef5a381654 Snapshot generator improvements
Summary: Added support for optional properties, random integers (uniform distr) and random strings.

Reviewers: mislav.bradac, buda

Reviewed By: mislav.bradac

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D885
2017-10-10 10:20:15 +02:00

324 lines
10 KiB
C++

#include <algorithm>
#include <experimental/optional>
#include <fstream>
#include <memory>
#include <random>
#include <unordered_map>
#include <vector>
#include <fmt/format.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <json/json.hpp>
#include "communication/bolt/v1/encoder/base_encoder.hpp"
#include "durability/file_writer_buffer.hpp"
#include "utils/assert.hpp"
#include "utils/string.hpp"
#include "utils/timer.hpp"
DEFINE_string(out, "", "Destination for the created snapshot file");
DEFINE_string(config, "", "Path to config JSON file");
/**
* This file contains the program for generating a snapshot based on a JSON
* definition. The JSON config has the following form:
*
* {
* "indexes" : ["Person.id", "Company.id"],
* "nodes" : [
* {
* "count" : 10000,
* "labels" : ["Person"],
* "properties" : {
* "id" : { "type" : "counter", "param": "Person.id" },
* "name" : { "type" : "randstring", "param" :
* { "type": "randint", "param" : [10, 20]}},
* "is_happy" : { "type" : "bernoulli", "param" : 0.2 }
* }
* },
* {
* "count" : 200,
* "labels" : ["Company"],
* "properties" : {
* "id" : { "type" : "counter", "param": "Company.id" },
* "name" : { "type" : "randstring", "param" :
* { "type": "randint", "param" : [10, 20]}},
* "description" : { "type" : "optional", "param" :
* [0.2, { "type" : "randstring", "param": 1024 }]}
* }
* }
* ],
* "edges" : [
* {
* "count" : 5000,
* "from" : "Person",
* "to" : "Company",
* "type" : "WORKS_IN"
* },
* {
* "count" : 20,
* "from" : "Person",
* "to" : "Company",
* "type" : "LIKES"
* }
* ]
* }
*/
// Utilities for writing to the snapshot file.
class Writer {
public:
/**
* Creates a writer.
*
* @param path - Path to the output file.
* @Param indexes - A list of (label, property) indexes to create, each in the
* "Label.property" form.
*/
Writer(const std::string &path, const std::vector<std::string> &indexes)
: buffer_(path), encoder_(buffer_) {
std::vector<std::string> indexes_flat;
for (const auto &index : indexes)
for (const auto &index_part : utils::Split(index, "."))
indexes_flat.emplace_back(index_part);
encoder_.WriteList(std::vector<query::TypedValue>(indexes_flat.begin(),
indexes_flat.end()));
}
int64_t WriteNode(
const std::vector<std::string> &labels,
std::unordered_map<std::string, query::TypedValue> properties) {
encoder_.WriteRAW(underlying_cast(communication::bolt::Marker::TinyStruct) +
3);
encoder_.WriteRAW(underlying_cast(communication::bolt::Signature::Node));
auto id = node_counter++;
encoder_.WriteInt(id);
encoder_.WriteList(
std::vector<query::TypedValue>{labels.begin(), labels.end()});
encoder_.WriteMap(properties);
return id;
}
int64_t WriteEdge(
const std::string &edge_type,
const std::unordered_map<std::string, query::TypedValue> properties,
int64_t bolt_id_from, int64_t bolt_id_to) {
encoder_.WriteRAW(underlying_cast(communication::bolt::Marker::TinyStruct) +
5);
encoder_.WriteRAW(
underlying_cast(communication::bolt::Signature::Relationship));
auto id = edge_counter_++;
encoder_.WriteInt(id);
encoder_.WriteInt(bolt_id_from);
encoder_.WriteInt(bolt_id_to);
encoder_.WriteString(edge_type);
encoder_.WriteMap(properties);
return id;
}
void Close() { buffer_.WriteSummary(node_counter, edge_counter_); }
private:
int64_t node_counter{0};
int64_t edge_counter_{0};
FileWriterBuffer buffer_;
communication::bolt::BaseEncoder<FileWriterBuffer> encoder_;
};
// Helper class for tracking info about the generated graph. It remembers which
// node ids carry which label so that edge endpoints can be picked per label.
class GraphState {
 public:
  // Tracks that the given node has the given label.
  void AddNode(const std::string &label, int64_t node_bolt_id) {
    // operator[] creates the (empty) node list on first use of a label.
    label_nodes_[label].emplace_back(node_bolt_id);
  }

  // Gets the ID of a random node that has the given label. Fails if no node
  // with that label has been added.
  int64_t RandomNode(const std::string &label) {
    auto found = label_nodes_.find(label);
    permanent_assert(found != label_nodes_.end(), "Label not found");
    const auto &nodes = found->second;
    // An integer distribution yields an in-range index directly. Scaling a
    // real number from [0, 1) depends on the generator never producing 1.0
    // and on an implicit double -> size_t conversion.
    // The list is never empty: AddNode always stores at least one id.
    std::uniform_int_distribution<std::size_t> pick(0, nodes.size() - 1);
    return nodes[pick(gen_)];
  }

 private:
  // Maps labels to node bolt_ids
  std::unordered_map<std::string, std::vector<int64_t>> label_nodes_;
  // Random generator
  std::mt19937 gen_{std::random_device{}()};
};
// Helper class for property value generation.
class ValueGenerator {
using json = nlohmann::json;
using TypedValue = query::TypedValue;
public:
// Generates the whole property map based on the given config.
std::unordered_map<std::string, query::TypedValue> MakeProperties(
const json &config) {
std::unordered_map<std::string, query::TypedValue> props;
if (config.is_null()) return props;
permanent_assert(config.is_object(), "Properties config must be a dict");
for (auto it = config.begin(); it != config.end(); it++) {
auto value = MakeValue(it.value());
if (value) props.emplace(it.key(), *value);
}
return props;
}
// Generates a single value based on the given config.
std::experimental::optional<TypedValue> MakeValue(const json &config) {
if (config.is_object()) {
const std::string &type = config["type"];
const auto &param = config["param"];
if (type == "primitive")
return Primitive(param);
else if (type == "counter")
return TypedValue(Counter(param));
else if (type == "optional")
return Optional(param);
else if (type == "bernoulli")
return TypedValue(Bernoulli(param));
else if (type == "randint")
return TypedValue(RandInt(param));
else if (type == "randstring")
return TypedValue(RandString(param));
else
permanent_fail("Unknown value type");
} else
return Primitive(config);
}
TypedValue Primitive(const json &config) {
if (config.is_string()) return config.get<std::string>();
if (config.is_number_integer()) return config.get<int64_t>();
if (config.is_number_float()) return config.get<double>();
if (config.is_boolean()) return config.get<bool>();
permanent_fail("Unsupported primitive type");
}
int64_t Counter(const std::string &name) {
auto found = counters_.find(name);
if (found == counters_.end()) {
counters_.emplace(name, 1);
return 0;
} else {
return found->second++;
}
}
int64_t RandInt(const json &range) {
permanent_assert(range.is_array() && range.size() == 2,
"RandInt value gen config must be a list with 2 elements");
auto from = MakeValue(range[0])->ValueInt();
auto to = MakeValue(range[1])->ValueInt();
permanent_assert(from < to,
"RandInt lower range must be lesser then upper range");
return (int64_t)(rand_(gen_) * (to - from)) + from;
}
std::string RandString(const json &length) {
static const char alphanum[] =
"0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz";
int length_int = MakeValue(length)->ValueInt();
std::string r_val(length_int, 'a');
for (int i = 0; i < length_int; ++i)
r_val[i] = alphanum[(int64_t)(rand_(gen_) * (sizeof(alphanum) - 1))];
return r_val;
}
bool Bernoulli(double p) { return rand_(gen_) < p; }
std::experimental::optional<TypedValue> Optional(const json &config) {
permanent_assert(
config.is_array() && config.size() == 2,
"Optional value gen config must be a list with 2 elements");
return Bernoulli(config[0]) ? MakeValue(config[1])
: std::experimental::nullopt;
}
private:
std::mt19937 gen_{std::random_device{}()};
std::uniform_real_distribution<> rand_{0.0, 1.0};
std::unordered_map<std::string, int64_t> counters_;
};
// Returns a copy of the value stored under the given key of the given JSON
// object, or a copy of the default value when there is no such key.
nlohmann::json GetWithDefault(const nlohmann::json &object,
                              const std::string &key,
                              const nlohmann::json &default_value) {
  const auto entry = object.find(key);
  return entry != object.end() ? *entry : default_value;
}
int main(int argc, char **argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InitGoogleLogging(argv[0]);
// Read the config JSON
nlohmann::json config;
{
std::ifstream config_file(FLAGS_config);
config_file >> config;
}
std::vector<std::string> indexes;
for (const auto &index : GetWithDefault(config, "indexes", {}))
indexes.push_back(index);
Writer writer(FLAGS_out, indexes);
GraphState state;
ValueGenerator value_generator;
// Create nodes
const auto &nodes_config = config["nodes"];
permanent_assert(
nodes_config.is_array() && nodes_config.size() > 0,
"Generator config must have 'nodes' array with at least one element");
for (const auto &node_config : config["nodes"]) {
permanent_assert(node_config.is_object(), "Node config must be a dict");
for (int i = 0; i < node_config["count"]; i++) {
const auto &labels_config = node_config["labels"];
permanent_assert(labels_config.is_array(),
"Must provide an array of node labels");
permanent_assert(node_config.size() > 0,
"Node labels array must contain at lest one element");
auto node_bolt_id = writer.WriteNode(
labels_config,
value_generator.MakeProperties(node_config["properties"]));
for (const auto &label : labels_config)
state.AddNode(label, node_bolt_id);
}
}
// Create edges
for (const auto &edge_config : config["edges"]) {
permanent_assert(edge_config.is_object(), "Edge config must be a dict");
const std::string &from = edge_config["from"];
const std::string &to = edge_config["to"];
for (int i = 0; i < edge_config["count"]; i++)
writer.WriteEdge(edge_config["type"],
value_generator.MakeProperties(
GetWithDefault(edge_config, "properties", nullptr)),
state.RandomNode(from), state.RandomNode(to));
}
writer.Close();
return 0;
}