Refactor snapshot generators to reduce code duplication

Reviewers: florijan, dgleich, mculinovic

Reviewed By: dgleich

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D1311
This commit is contained in:
Marin Tomic 2018-03-22 16:33:03 +01:00
parent a39602093d
commit b9c5af2568
9 changed files with 644 additions and 692 deletions

View File

@ -15,6 +15,10 @@ namespace fs = std::experimental::filesystem;
namespace durability { namespace durability {
// Snapshot layout is described in durability/version.hpp
static_assert(durability::kVersion == 5,
"Wrong snapshot version, please update!");
namespace { namespace {
bool Encode(const fs::path &snapshot_file, database::GraphDb &db, bool Encode(const fs::path &snapshot_file, database::GraphDb &db,
database::GraphDbAccessor &dba) { database::GraphDbAccessor &dba) {
@ -80,7 +84,7 @@ bool Encode(const fs::path &snapshot_file, database::GraphDb &db,
return true; return true;
} }
// Removes snaposhot files so that only `max_retained` latest ones are kept. If // Removes snapshot files so that only `max_retained` latest ones are kept. If
// `max_retained == -1`, all the snapshots are retained. // `max_retained == -1`, all the snapshots are retained.
void RemoveOldSnapshots(const fs::path &snapshot_dir, int max_retained) { void RemoveOldSnapshots(const fs::path &snapshot_dir, int max_retained) {
if (max_retained == -1) return; if (max_retained == -1) return;

View File

@ -7,6 +7,32 @@ namespace durability {
constexpr std::array<uint8_t, 4> kMagicNumber{{'M', 'G', 's', 'n'}}; constexpr std::array<uint8_t, 4> kMagicNumber{{'M', 'G', 's', 'n'}};
// The current default version of snapshot and WAL enconding / decoding. // The current default version of snapshot and WAL encoding / decoding.
constexpr int64_t kVersion{5}; constexpr int64_t kVersion{5};
// Snapshot format (version 5):
// 1) Magic number + snapshot version
// 2) Distributed worker ID
//
// The following two entries indicate the starting points for generating new
// vertex/edge IDs in the DB. They are important when there are vertices/edges
// that were moved to another worker (in distributed Memgraph).
// 3) Vertex generator ID
// 4) Edge generator ID
//
// 5) A list of label+property indices.
//
// The following two entries are required when recovering from snapshot combined
// with WAL to determine record visibility.
// 6) Transactional ID of the snapshooter
// 7) Transactional snapshot of the snapshooter
//
// We must inline edges with nodes because some edges might be stored on other
// worker (edges are always stored only on the worker of the edge source).
// 8) Bolt encoded nodes + inlined edges (edge address, other endpoint address
// and edge type)
// 9) Bolt encoded edges
//
// 10) Snapshot summary (number of nodes, number of edges, hash)
} // namespace durability } // namespace durability

View File

@ -1,19 +1,7 @@
{ {
"indexes" : ["Card.id", "Pos.id", "Transaction.fraud_reported"], "cards_per_worker" : 10000,
"nodes" : [ "pos_per_worker" : 1000,
{ "transactions_per_worker" : 50000,
"count_per_worker" : 10000,
"label" : "Card"
},
{
"count_per_worker" : 1000,
"label" : "Pos"
},
{
"count_per_worker" : 50000,
"label" : "Transaction"
}
],
"compromised_pos_probability" : 0.2, "compromised_pos_probability" : 0.2,
"fraud_reported_probability" : 0.1, "fraud_reported_probability" : 0.1,
"hop_probability" : 0.1 "hop_probability" : 0.1

View File

@ -11,4 +11,4 @@ fi
NUM_MACHINES="$( cat card_fraud.py | grep -m1 "NUM_MACHINES" | tail -c 2 )" NUM_MACHINES="$( cat card_fraud.py | grep -m1 "NUM_MACHINES" | tail -c 2 )"
../../../build_release/tests/manual/card_fraud_generate_snapshot --config config.json --num-workers $NUM_MACHINES --dir $output_dir ../../../build_release/tests/manual/card_fraud_generate_snapshot --config config.json --num-workers $NUM_MACHINES --dir $output_dir

View File

@ -18,7 +18,10 @@
#include "utils/string.hpp" #include "utils/string.hpp"
#include "utils/timer.hpp" #include "utils/timer.hpp"
DEFINE_string(num_workers, "1", #include "snapshot_generation/graph_state.hpp"
#include "snapshot_generation/snapshot_writer.hpp"
DEFINE_int32(num_workers, 1,
"Number of distributed workers (including master)"); "Number of distributed workers (including master)");
DEFINE_string(dir, "tmp", DEFINE_string(dir, "tmp",
"Directory for storing workers durability directories."); "Directory for storing workers durability directories.");
@ -26,262 +29,72 @@ DEFINE_string(config, "", "Path to config JSON file");
/** /**
* Config file should defined as follows. * Config file should defined as follows.
*
*{
* "indexes" : ["Card.id", "Pos.id", "Transaction.fraud_reported"],
* "nodes" : [
* { * {
* "count_per_worker" : 10, * "cards_per_worker" : 10000,
* "label" : "Card" * "pos_per_worker" : 1000,
* }, * "transactions_per_worker" : 50000,
* {
* "count_per_worker" : 10,
* "label" : "Pos"
* },
* {
* "count_per_worker" : 20,
* "label" : "Transaction"
* }
* ],
* "compromised_pos_probability" : 0.2, * "compromised_pos_probability" : 0.2,
* "fraud_reported_probability" : 0.1, * "fraud_reported_probability" : 0.1,
* "hop_percentage" : 0.1 * "hop_probability" : 0.1
*} * }
*/ */
namespace fs = std::experimental::filesystem; using namespace snapshot_generation;
using nlohmann::json;
/// Helper class for tracking info about the generated graph. json BuildGenericConfig(const json &config) {
class GraphState { json indices = json::array(
typedef std::unordered_map<std::string, std::vector<gid::Gid>> LabelNodes; {"Card.id", "Pos.id", "Transaction.id", "Transaction.fraud_reported"});
public: json cards;
GraphState(int num_workers) cards["labels"] = {"Card"};
: worker_nodes(num_workers), cards["count_per_worker"] = config["cards_per_worker"];
compromised_pos(num_workers),
compromised_card(num_workers),
out_edges(num_workers),
in_edges(num_workers) {}
// Gets the ID of a random node on worker that has the given label. cards["properties"] = json::object();
gid::Gid RandomNode(int worker_id, const std::string &label) { cards["properties"].push_back(
auto &label_nodes = worker_nodes[worker_id]; {"id", {{"type", "counter"}, {"param", "Card.id"}}});
auto found = label_nodes.find(label); cards["properties"].push_back(
CHECK(found != label_nodes.end()) << "Label not found"; {"compromised", {{"type", "primitive"}, {"param", false}}});
return found->second[rand_(gen_) * found->second.size()];
}
bool IsCompromisedPos(int worker_id, gid::Gid pos_id) { json pos;
std::unordered_set<gid::Gid> &compromised = compromised_pos[worker_id]; pos["labels"] = {"Pos"};
return compromised.find(pos_id) != compromised.end(); pos["count_per_worker"] = config["pos_per_worker"];
}
bool IsCompromisedCard(int worker_id, gid::Gid card_id) { pos["properties"] = json::object();
std::unordered_set<gid::Gid> &compromised = compromised_card[worker_id]; pos["properties"].push_back(
return compromised.find(card_id) != compromised.end(); {"id", {{"type", "counter"}, {"param", "Pos.id"}}});
} pos["properties"].push_back(
{"compromised",
{{"type", "bernoulli"},
{"param", config["compromised_pos_probability"]}}});
pos["properties"].push_back(
{"connected_frauds", {{"type", "primitive"}, {"param", 0}}});
struct Edge { json txs;
enum class Type { USING, AT }; txs["labels"] = {"Transaction"};
gid::Gid gid; txs["count_per_worker"] = config["transactions_per_worker"];
storage::VertexAddress from;
storage::VertexAddress to;
Type type;
};
void AddEdge(gid::Gid edge_gid, storage::VertexAddress from, txs["properties"] = json::object();
storage::VertexAddress to, Edge::Type type) { txs["properties"].push_back(
out_edges[from.worker_id()][from.gid()].emplace_back(edges.size()); {"id", {{"type", "counter"}, {"param", "Transaction.id"}}});
in_edges[to.worker_id()][to.gid()].emplace_back(edges.size()); txs["properties"].push_back(
edges.emplace_back(Edge{edge_gid, from, to, type}); {"fraud_reported", {{"type", "primitive"}, {"param", false}}});
}
// Maps worker node labels to node bolt_ids. json edges;
std::vector<LabelNodes> worker_nodes; edges = json::array();
edges.push_back({{"kind", "unique"},
{"from", "Transaction"},
{"to", "Card"},
{"type", "Using"},
{"hop_probability", config["hop_probability"]}});
edges.push_back({{"kind", "unique"},
{"from", "Transaction"},
{"to", "Pos"},
{"type", "At"},
{"hop_probability", config["hop_probability"]}});
// Compromised cards and pos. return json::object(
std::vector<std::unordered_set<gid::Gid>> compromised_pos; {{"indexes", indices}, {"nodes", {cards, pos, txs}}, {"edges", edges}});
std::vector<std::unordered_set<gid::Gid>> compromised_card;
// In/Out Vertex Edges.
std::vector<Edge> edges;
std::vector<std::unordered_map<gid::Gid, std::vector<int>>> out_edges;
std::vector<std::unordered_map<gid::Gid, std::vector<int>>> in_edges;
// Random generator
std::mt19937 gen_{std::random_device{}()};
std::uniform_real_distribution<> rand_{0.0, 1.0};
};
// Utilities for writing to the snapshot file.
// Snapshot file has the following contents in order:
// 1) Magic number.
// 2) Worker Id
// 3) Internal Id of vertex generator
// 4) Internal Id of edge generator
// 5) Transaction ID of the snapshooter. When generated set to 0.
// 6) Transactional snapshot of the snapshoter. When the snapshot is
// generated it's an empty list.
// 7) List of label+property index.
// 8) All nodes, sequentially, but not encoded as a list.
// 9) All relationships, sequentially, but not encoded as a list.
// 10) Summary with node count, relationship count and hash digest.
class Writer {
using DecodedValue = communication::bolt::DecodedValue;
const std::string kEdgeUsing = "Using";
const std::string kEdgeAt = "At";
public:
Writer(const std::string &path, int worker_id) : buffer_(path) {
// 1) Magic number
encoder_.WriteRAW(durability::kMagicNumber.data(),
durability::kMagicNumber.size());
encoder_.WriteTypedValue(durability::kVersion);
// 2) WorkerId - important for distributed storage
worker_id_ = worker_id;
encoder_.WriteInt(worker_id_);
// The following two entries indicate the starting points for generating new
// Vertex/Edge IDs in the DB. They are important when there are
// vertices/edges that were moved to another worker (in distributed
// Memgraph), so be careful! In this case we don't have to worry
// because we'll never move vertices/edges between workers.
// 3) ID of the vertex generator.
encoder_.WriteInt(0);
// 4) ID of the edge generator.
encoder_.WriteInt(0);
// 5) Transactional ID of the snapshooter.
encoder_.WriteInt(0);
// 6) Transactional Snapshot is an empty list of transaction IDs.
encoder_.WriteList(std::vector<query::TypedValue>{});
}
template <typename TValue>
void WriteList(const std::vector<TValue> &list) {
encoder_.WriteList(
std::vector<query::TypedValue>(list.begin(), list.end()));
}
void WriteNode(gid::Gid id, const std::string &label,
std::unordered_map<std::string, DecodedValue> &properties,
const std::vector<GraphState::Edge> &edges,
const std::vector<int> &out_edges,
const std::vector<int> &in_edges) {
encoder_.WriteRAW(underlying_cast(communication::bolt::Marker::TinyStruct) +
3);
encoder_.WriteRAW(underlying_cast(communication::bolt::Signature::Node));
encoder_.WriteInt(id);
encoder_.WriteList(std::vector<query::TypedValue>{label});
encoder_.WriteMap(properties);
encoder_.WriteInt(in_edges.size());
for (auto edge_num : in_edges) {
auto &edge = edges[edge_num];
auto edge_addr = storage::EdgeAddress(edge.gid, edge.from.worker_id());
encoder_.WriteInt(edge_addr.raw());
encoder_.WriteInt(edge.from.raw());
encoder_.WriteString(
edge.type == GraphState::Edge::Type::USING ? kEdgeUsing : kEdgeAt);
}
encoder_.WriteInt(out_edges.size());
for (auto edge_num : out_edges) {
auto &edge = edges[edge_num];
auto edge_addr = storage::EdgeAddress(edge.gid, edge.from.worker_id());
encoder_.WriteInt(edge_addr.raw());
encoder_.WriteInt(edge.to.raw());
encoder_.WriteString(
edge.type == GraphState::Edge::Type::USING ? kEdgeUsing : kEdgeAt);
}
}
void WriteEdge(const GraphState::Edge &edge) {
encoder_.WriteRAW(underlying_cast(communication::bolt::Marker::TinyStruct) +
5);
encoder_.WriteRAW(
underlying_cast(communication::bolt::Signature::Relationship));
encoder_.WriteInt(edge.gid);
encoder_.WriteInt(edge.from.gid());
encoder_.WriteInt(edge.to.gid());
encoder_.WriteString(edge.type == GraphState::Edge::Type::USING ? kEdgeUsing
: kEdgeAt);
// Write edges to snapshot.
std::unordered_map<std::string, communication::bolt::DecodedValue>
empty_props{};
encoder_.WriteMap(empty_props);
}
void Close(uint64_t node_count, uint64_t edge_count, uint64_t hops_count) {
// 10) Summary with node count, relationship count and hash digest.
buffer_.WriteValue(node_count);
buffer_.WriteValue(edge_count);
buffer_.WriteValue(buffer_.hash());
buffer_.Close();
// Log summary
LOG(INFO) << fmt::format("-- Summary for worker: {}", worker_id_);
LOG(INFO) << fmt::format("---- Total nodes: {}", node_count);
LOG(INFO) << fmt::format("---- Total edges: {}", edge_count);
LOG(INFO) << fmt::format("---- Hop edges: {}", hops_count);
}
private:
HashedFileWriter buffer_;
durability::SnapshotEncoder<HashedFileWriter> encoder_{buffer_};
int worker_id_;
};
// Helper class for property value generation.
class ValueGenerator {
using json = nlohmann::json;
using DecodedValue = communication::bolt::DecodedValue;
public:
std::unordered_map<std::string, DecodedValue> MakePosProperties(
bool compromised, int worker_id) {
std::unordered_map<std::string, DecodedValue> props;
props.emplace("id", DecodedValue(Counter("Pos.id")));
props.emplace("worker_id", DecodedValue(worker_id));
props.emplace("compromised", DecodedValue(compromised));
props.emplace("connected_frauds", DecodedValue(0));
return props;
}
std::unordered_map<std::string, DecodedValue> MakeTxProperties(
bool fraud_reported, int worker_id, int id) {
std::unordered_map<std::string, DecodedValue> props;
props.emplace("id", DecodedValue(id));
props.emplace("worker_id", DecodedValue(worker_id));
props.emplace("fraud_reported", DecodedValue(fraud_reported));
return props;
}
std::unordered_map<std::string, DecodedValue> MakeCardProperties(
bool compromised, int worker_id) {
std::unordered_map<std::string, DecodedValue> props;
props.emplace("id", DecodedValue(Counter("Card.id")));
props.emplace("worker_id", DecodedValue(worker_id));
props.emplace("compromised", DecodedValue(compromised));
return props;
}
int64_t Counter(const std::string &name) { return counters_[name]++; }
bool Bernoulli(double p) { return rand_(gen_) < p; }
private:
std::mt19937 gen_{std::random_device{}()};
std::uniform_real_distribution<> rand_{0.0, 1.0};
std::unordered_map<std::string, int64_t> counters_;
};
nlohmann::json GetWithDefault(const nlohmann::json &object,
const std::string &key,
const nlohmann::json &default_value) {
const auto &found = object.find(key);
if (found == object.end()) return default_value;
return *found;
} }
int main(int argc, char **argv) { int main(int argc, char **argv) {
@ -294,185 +107,44 @@ int main(int argc, char **argv) {
config_file >> config; config_file >> config;
} }
// Vector IDs. GraphState state =
int num_workers = std::atoi(FLAGS_num_workers.c_str()); BuildFromConfig(FLAGS_num_workers, BuildGenericConfig(config));
std::vector<int> worker_ids(num_workers);
std::iota(worker_ids.begin(), worker_ids.end(), 0);
// Generated node and edge counters. // TODO(mtomic): this is not currently used in the demo, maybe remove?
std::vector<int> nodes_count(num_workers, 0);
std::vector<int> hops_count(num_workers, 0);
GraphState state(num_workers); // A card is compromised if it was used on a compromised POS.
ValueGenerator value_generator; for (auto &tx_gid : state.NodesWithLabel("Transaction")) {
auto &node = state.GetNode(tx_gid);
const std::string kLabelTransaction = "Transaction"; auto &edge1 = state.GetEdge(node.out_edges[0]);
const std::string kLabelCard = "Card"; auto &edge2 = state.GetEdge(node.out_edges[1]);
const std::string kLabelPos = "Pos"; if (edge1.type != "Using") {
const fs::path kSnapshotDir = "snapshots"; std::swap(edge1, edge2);
double compromised_pos_probability = config["compromised_pos_probability"];
double fraud_reported_probability = config["fraud_reported_probability"];
double hop_probability = config["hop_probability"];
LOG(INFO) << "Creating snapshots with config: ";
LOG(INFO) << fmt::format("-- Compromised Pos probability: {}",
compromised_pos_probability);
LOG(INFO) << fmt::format("-- Hop probability : {}", hop_probability);
// Allocate ids for nodes and write them to state.
LOG(INFO) << "Creating nodes...";
for (auto worker_id : worker_ids) {
gid::Generator node_generator{worker_id};
const auto &nodes_config = config["nodes"];
CHECK(nodes_config.is_array() && nodes_config.size() > 0)
<< "Generator config must have 'nodes' array with at least one element";
for (const auto &node_config : config["nodes"]) {
CHECK(node_config.is_object()) << "Node config must be a dict";
const auto &label = node_config["label"];
CHECK(label.is_string() && !label.empty())
<< "Node must have label specified";
for (int i = 0; i < node_config["count_per_worker"]; i++) {
auto node_gid = node_generator.Next(std::experimental::nullopt);
if (label == kLabelPos &&
value_generator.Bernoulli(compromised_pos_probability)) {
// Write compromised to state.
state.compromised_pos[worker_id].insert(node_gid);
} }
// Write node to state. DCHECK(edge1.type == "Using" && edge2.type == "At");
state.worker_nodes[worker_id][label].emplace_back(node_gid);
if (state.GetNode(edge2.to).props.at("compromised").Value<bool>()) {
state.GetNode(edge1.to).props.at("compromised") = true;
} }
} }
nodes_count[worker_id] = node_generator.LocalCount();
// Transaction is reported as fraudulent with some probability if a
// compromised card was used.
std::mt19937 gen{std::random_device{}()};
std::uniform_real_distribution<double> dist(0, 1);
for (auto &tx_gid : state.NodesWithLabel("Transaction")) {
auto &node = state.GetNode(tx_gid);
auto &edge = state.GetEdge(node.out_edges[0]);
DCHECK(edge.type == "Using");
if (state.GetNode(edge.to).props.at("compromised").Value<bool>()) {
node.props.at("fraud_reported") =
node.props.at("fraud_reported").Value<bool>() ||
(dist(gen) < config["fraud_reported_probability"]);
} }
LOG(INFO) << "Creating nodes...DONE";
std::random_device random_device;
std::mt19937 engine{random_device()};
// Create edges for each transaction.
LOG(INFO) << "Creating edges...";
for (auto &worker_id : worker_ids) {
gid::Generator edge_generator{worker_id};
auto filter = [worker_id, num_workers](const int other_worker) {
if (num_workers == 1) return true;
return other_worker != worker_id;
};
std::vector<int> hop_workers;
std::copy_if(worker_ids.begin(), worker_ids.end(),
std::back_inserter(hop_workers), filter);
std::uniform_int_distribution<int> dist(0, hop_workers.size() - 1);
// Create and write edges to state.
// Write compromised cards to state.
const auto &transactions = state.worker_nodes[worker_id][kLabelTransaction];
for (auto &transaction_id : transactions) {
int card_worker_id = worker_id;
if (value_generator.Bernoulli(hop_probability)) {
card_worker_id = hop_workers[dist(engine)];
++hops_count[worker_id];
}
auto card_id = state.RandomNode(card_worker_id, kLabelCard);
int pos_worker_id = worker_id;
if (value_generator.Bernoulli(hop_probability)) {
pos_worker_id = hop_workers[dist(engine)];
++hops_count[worker_id];
}
auto pos_id = state.RandomNode(pos_worker_id, kLabelPos);
auto edge_using_id = edge_generator.Next(std::experimental::nullopt);
state.AddEdge(edge_using_id,
storage::VertexAddress(transaction_id, worker_id),
storage::VertexAddress(card_id, card_worker_id),
GraphState::Edge::Type::USING);
auto edge_at_id = edge_generator.Next(std::experimental::nullopt);
state.AddEdge(edge_at_id,
storage::VertexAddress(transaction_id, worker_id),
storage::VertexAddress(pos_id, pos_worker_id),
GraphState::Edge::Type::AT);
if (state.IsCompromisedPos(pos_worker_id, pos_id)) {
state.compromised_card[card_worker_id].insert(card_id);
}
}
}
LOG(INFO) << "Creating edges...DONE";
// Write snapshot files.
LOG(INFO) << "Writing snapshots...";
for (int worker_id = 0; worker_id < num_workers; ++worker_id) {
const fs::path durability_dir =
FLAGS_dir / fs::path("worker_" + std::to_string(worker_id));
if (!durability::EnsureDir(durability_dir / kSnapshotDir)) {
LOG(ERROR) << "Unable to create durability directory!";
exit(0);
}
const auto snapshot_file =
durability::MakeSnapshotPath(durability_dir, worker_id);
Writer writer(snapshot_file, worker_id);
// Write indexes to snapshot.
std::vector<std::string> indexes;
for (const auto &index : GetWithDefault(config, "indexes", {}))
for (const auto &index_part : utils::Split(index, "."))
indexes.push_back(index_part);
writer.WriteList(indexes);
// Write Cards to snapshot.
for (auto &card_id : state.worker_nodes[worker_id][kLabelCard]) {
bool is_compromised = state.IsCompromisedCard(worker_id, card_id);
auto props =
value_generator.MakeCardProperties(is_compromised, worker_id);
DCHECK(state.out_edges[worker_id][card_id].size() == 0);
writer.WriteNode(card_id, kLabelCard, props, state.edges,
state.out_edges[worker_id][card_id],
state.in_edges[worker_id][card_id]);
} }
// Write Pos to snapshot. WriteToSnapshot(state, FLAGS_dir);
for (auto &pos_id : state.worker_nodes[worker_id][kLabelPos]) {
bool is_compromised = state.IsCompromisedPos(worker_id, pos_id);
auto props = value_generator.MakePosProperties(is_compromised, worker_id);
DCHECK(state.out_edges[worker_id][pos_id].size() == 0);
writer.WriteNode(pos_id, kLabelPos, props, state.edges,
state.out_edges[worker_id][pos_id],
state.in_edges[worker_id][pos_id]);
}
// Write Transactions to snapshot.
int transaction_id = 0;
for (auto &tx_id : state.worker_nodes[worker_id][kLabelTransaction]) {
const auto &edges = state.edges;
const auto &out_edges = state.out_edges[worker_id][tx_id];
DCHECK(out_edges.size() == 2);
DCHECK(edges[out_edges[0]].type == GraphState::Edge::Type::USING);
DCHECK(edges[out_edges[1]].type == GraphState::Edge::Type::AT);
DCHECK(state.in_edges[worker_id][tx_id].size() == 0);
bool fraud_reported = false;
if (state.IsCompromisedCard(edges[out_edges[0]].to.worker_id(),
edges[out_edges[0]].to.gid())) {
fraud_reported = value_generator.Bernoulli(fraud_reported_probability);
}
auto props = value_generator.MakeTxProperties(
fraud_reported, worker_id, transaction_id * num_workers + worker_id);
writer.WriteNode(tx_id, kLabelTransaction, props, state.edges,
state.out_edges[worker_id][tx_id],
state.in_edges[worker_id][tx_id]);
transaction_id++;
}
// Write edges to snapshot.
int edge_count{0};
for (auto &edge : state.edges) {
if (edge.from.worker_id() == worker_id) {
writer.WriteEdge(edge);
++edge_count;
}
}
writer.Close(nodes_count[worker_id], edge_count, hops_count[worker_id]);
}
LOG(INFO) << "Writing snapshots...DONE";
return 0; return 0;
} }

View File

@ -18,9 +18,15 @@
#include "utils/string.hpp" #include "utils/string.hpp"
#include "utils/timer.hpp" #include "utils/timer.hpp"
DEFINE_string(out, "", "Destination for the created snapshot file"); #include "snapshot_generation/graph_state.hpp"
#include "snapshot_generation/snapshot_writer.hpp"
DEFINE_int32(num_workers, 1, "number of workers");
DEFINE_string(out, "tmp", "Destination for the created snapshot file");
DEFINE_string(config, "", "Path to config JSON file"); DEFINE_string(config, "", "Path to config JSON file");
using namespace snapshot_generation;
/** /**
* This file contains the program for generating a snapshot based on a JSON * This file contains the program for generating a snapshot based on a JSON
* definition. The JSON config has the following form: * definition. The JSON config has the following form:
@ -29,7 +35,7 @@ DEFINE_string(config, "", "Path to config JSON file");
* "indexes" : ["Person.id", "Company.id"], * "indexes" : ["Person.id", "Company.id"],
* "nodes" : [ * "nodes" : [
* { * {
* "count" : 10000, * "count_per_worker" : 10000,
* "labels" : ["Person"], * "labels" : ["Person"],
* "properties" : { * "properties" : {
* "id" : { "type" : "counter", "param": "Person.id" }, * "id" : { "type" : "counter", "param": "Person.id" },
@ -39,7 +45,7 @@ DEFINE_string(config, "", "Path to config JSON file");
* } * }
* }, * },
* { * {
* "count" : 200, * "count_per_worker" : 200,
* "labels" : ["Company"], * "labels" : ["Company"],
* "properties" : { * "properties" : {
* "id" : { "type" : "counter", "param": "Company.id" }, * "id" : { "type" : "counter", "param": "Company.id" },
@ -52,228 +58,25 @@ DEFINE_string(config, "", "Path to config JSON file");
* ], * ],
* "edges" : [ * "edges" : [
* { * {
* "count" : 5000, * "kind": "unique",
* "from" : "Person", * "from" : "Person",
* "to" : "Company", * "to" : "Company",
* "type" : "WORKS_IN" * "type" : "WORKS_IN",
* "hop_probability": 0.05
* }, * },
* { * {
* "kind": "random",
* "count" : 20, * "count" : 20,
* "from" : "Person", * "from" : "Person",
* "to" : "Company", * "to" : "Company",
* "type" : "LIKES" * "type" : "LIKES",
* "hop_probability": 0.1
* } * }
* ] * ]
* } * }
*/ */
// Utilities for writing to the snapshot file.
class Writer {
public:
Writer(const std::string &path) : buffer_(path) {
encoder_.WriteRAW(durability::kMagicNumber.data(),
durability::kMagicNumber.size());
encoder_.WriteTypedValue(durability::kVersion);
// ID of the vertex generator.
encoder_.WriteInt(0);
// ID of the edge generator.
encoder_.WriteInt(0);
// Transactional ID of the snapshooter.
encoder_.WriteInt(0);
// Transactional Snapshot is an empty list of transaction IDs.
encoder_.WriteList(std::vector<query::TypedValue>{});
}
template <typename TValue>
void WriteList(const std::vector<TValue> &list) {
encoder_.WriteList(
std::vector<query::TypedValue>(list.begin(), list.end()));
}
int64_t WriteNode(
const std::vector<std::string> &labels,
std::unordered_map<std::string, query::TypedValue> properties) {
encoder_.WriteRAW(underlying_cast(communication::bolt::Marker::TinyStruct) +
3);
encoder_.WriteRAW(underlying_cast(communication::bolt::Signature::Node));
auto id = node_generator_.Next(std::experimental::nullopt);
encoder_.WriteInt(id);
encoder_.WriteList(
std::vector<query::TypedValue>{labels.begin(), labels.end()});
encoder_.WriteMap(properties);
return id;
}
int64_t WriteEdge(
const std::string &edge_type,
const std::unordered_map<std::string, query::TypedValue> properties,
int64_t bolt_id_from, int64_t bolt_id_to) {
encoder_.WriteRAW(underlying_cast(communication::bolt::Marker::TinyStruct) +
5);
encoder_.WriteRAW(
underlying_cast(communication::bolt::Signature::Relationship));
auto id = edge_generator_.Next(std::experimental::nullopt);
encoder_.WriteInt(id);
encoder_.WriteInt(bolt_id_from);
encoder_.WriteInt(bolt_id_to);
encoder_.WriteString(edge_type);
encoder_.WriteMap(properties);
return id;
}
void Close() {
buffer_.WriteValue(node_generator_.LocalCount());
buffer_.WriteValue(edge_generator_.LocalCount());
buffer_.WriteValue(buffer_.hash());
buffer_.Close();
}
private:
gid::Generator node_generator_{0};
gid::Generator edge_generator_{0};
HashedFileWriter buffer_;
communication::bolt::BaseEncoder<HashedFileWriter> encoder_{buffer_};
};
// Helper class for tracking info about the generated graph.
class GraphState {
public:
// Tracks that the given node has the given label.
void AddNode(const std::string &label, int64_t node_bolt_id) {
auto found = label_nodes_.find(label);
if (found == label_nodes_.end())
label_nodes_.emplace(label, std::vector<int64_t>{node_bolt_id});
else
found->second.emplace_back(node_bolt_id);
}
// Gets the ID of a random node that has the given label.
int64_t RandomNode(const std::string &label) {
auto found = label_nodes_.find(label);
CHECK(found != label_nodes_.end()) << "Label not found";
return found->second[rand_(gen_) * found->second.size()];
}
private:
// Maps labels to node bolt_ids
std::unordered_map<std::string, std::vector<int64_t>> label_nodes_;
// Random generator
std::mt19937 gen_{std::random_device{}()};
std::uniform_real_distribution<> rand_{0.0, 1.0};
};
// Helper class for property value generation.
class ValueGenerator {
using json = nlohmann::json;
using TypedValue = query::TypedValue;
public:
// Generates the whole property map based on the given config.
std::unordered_map<std::string, query::TypedValue> MakeProperties(
const json &config) {
std::unordered_map<std::string, query::TypedValue> props;
if (config.is_null()) return props;
CHECK(config.is_object()) << "Properties config must be a dict";
for (auto it = config.begin(); it != config.end(); it++) {
auto value = MakeValue(it.value());
if (value) props.emplace(it.key(), *value);
}
return props;
}
// Generates a single value based on the given config.
std::experimental::optional<TypedValue> MakeValue(const json &config) {
if (config.is_object()) {
const std::string &type = config["type"];
const auto &param = config["param"];
if (type == "primitive")
return Primitive(param);
else if (type == "counter")
return TypedValue(Counter(param));
else if (type == "optional")
return Optional(param);
else if (type == "bernoulli")
return TypedValue(Bernoulli(param));
else if (type == "randint")
return TypedValue(RandInt(param));
else if (type == "randstring")
return TypedValue(RandString(param));
else
LOG(FATAL) << "Unknown value type";
} else
return Primitive(config);
}
TypedValue Primitive(const json &config) {
if (config.is_string()) return config.get<std::string>();
if (config.is_number_integer()) return config.get<int64_t>();
if (config.is_number_float()) return config.get<double>();
if (config.is_boolean()) return config.get<bool>();
LOG(FATAL) << "Unsupported primitive type";
}
int64_t Counter(const std::string &name) {
auto found = counters_.find(name);
if (found == counters_.end()) {
counters_.emplace(name, 1);
return 0;
} else {
return found->second++;
}
}
int64_t RandInt(const json &range) {
CHECK(range.is_array() && range.size() == 2)
<< "RandInt value gen config must be a list with 2 elements";
auto from = MakeValue(range[0])->ValueInt();
auto to = MakeValue(range[1])->ValueInt();
CHECK(from < to) << "RandInt lower range must be lesser then upper range";
return (int64_t)(rand_(gen_) * (to - from)) + from;
}
std::string RandString(const json &length) {
static const char alphanum[] =
"0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz";
int length_int = MakeValue(length)->ValueInt();
std::string r_val(length_int, 'a');
for (int i = 0; i < length_int; ++i)
r_val[i] = alphanum[(int64_t)(rand_(gen_) * (sizeof(alphanum) - 1))];
return r_val;
}
bool Bernoulli(double p) { return rand_(gen_) < p; }
std::experimental::optional<TypedValue> Optional(const json &config) {
CHECK(config.is_array() && config.size() == 2)
<< "Optional value gen config must be a list with 2 elements";
return Bernoulli(config[0]) ? MakeValue(config[1])
: std::experimental::nullopt;
}
private:
std::mt19937 gen_{std::random_device{}()};
std::uniform_real_distribution<> rand_{0.0, 1.0};
std::unordered_map<std::string, int64_t> counters_;
};
nlohmann::json GetWithDefault(const nlohmann::json &object,
const std::string &key,
const nlohmann::json &default_value) {
const auto &found = object.find(key);
if (found == object.end()) return default_value;
return *found;
}
int main(int argc, char **argv) { int main(int argc, char **argv) {
LOG(FATAL) << "Doesn't work with the newest format - waiting for refactor";
gflags::ParseCommandLineFlags(&argc, &argv, true); gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InitGoogleLogging(argv[0]); google::InitGoogleLogging(argv[0]);
@ -284,49 +87,8 @@ int main(int argc, char **argv) {
config_file >> config; config_file >> config;
} }
Writer writer(FLAGS_out); GraphState state = BuildFromConfig(FLAGS_num_workers, config);
GraphState state; WriteToSnapshot(state, FLAGS_out);
ValueGenerator value_generator;
std::vector<std::string> indexes;
for (const auto &index : GetWithDefault(config, "indexes", {}))
for (const auto &index_part : utils::Split(index, "."))
indexes.push_back(index_part);
writer.WriteList(indexes);
// Create nodes
const auto &nodes_config = config["nodes"];
CHECK(nodes_config.is_array() && nodes_config.size() > 0)
<< "Generator config must have 'nodes' array with at least one element";
for (const auto &node_config : config["nodes"]) {
CHECK(node_config.is_object()) << "Node config must be a dict";
for (int i = 0; i < node_config["count"]; i++) {
const auto &labels_config = node_config["labels"];
CHECK(labels_config.is_array()) << "Must provide an array of node labels";
CHECK(node_config.size() > 0)
<< "Node labels array must contain at lest one element";
auto node_bolt_id = writer.WriteNode(
labels_config,
value_generator.MakeProperties(node_config["properties"]));
for (const auto &label : labels_config)
state.AddNode(label, node_bolt_id);
}
}
// Create edges
for (const auto &edge_config : config["edges"]) {
CHECK(edge_config.is_object()) << "Edge config must be a dict";
const std::string &from = edge_config["from"];
const std::string &to = edge_config["to"];
for (int i = 0; i < edge_config["count"]; i++)
writer.WriteEdge(edge_config["type"],
value_generator.MakeProperties(
GetWithDefault(edge_config, "properties", nullptr)),
state.RandomNode(from), state.RandomNode(to));
}
writer.Close();
return 0; return 0;
} }

View File

@ -0,0 +1,250 @@
#pragma once
#include <functional>
#include <random>
#include <unordered_map>
#include "cppitertools/itertools.hpp"
#include "json/json.hpp"
#include "storage/gid.hpp"
#include "storage/property_value.hpp"
#include "utils/string.hpp"
#include "value_generator.hpp"
namespace snapshot_generation {
nlohmann::json GetWithDefault(const nlohmann::json &object,
const std::string &key,
const nlohmann::json &default_value) {
const auto &found = object.find(key);
if (found == object.end()) return default_value;
return *found;
}
struct Node {
gid::Gid gid;
std::vector<std::string> labels;
std::unordered_map<std::string, PropertyValue> props;
std::vector<gid::Gid> in_edges;
std::vector<gid::Gid> out_edges;
};
struct Edge {
gid::Gid gid;
gid::Gid from;
gid::Gid to;
std::string type;
std::unordered_map<std::string, PropertyValue> props;
};
/// Helper class for tracking info about the generated graph.
class GraphState {
public:
explicit GraphState(int num_workers)
: num_workers_(num_workers),
worker_nodes_(num_workers),
worker_edges_(num_workers) {
for (int worker_id = 0; worker_id < num_workers; ++worker_id) {
edge_generators_.emplace_back(
std::make_unique<gid::Generator>(worker_id));
node_generators_.emplace_back(
std::make_unique<gid::Generator>(worker_id));
}
}
int NumWorkers() { return num_workers_; }
int64_t NumNodesOnWorker(int worker_id) {
return node_generators_[worker_id]->LocalCount();
}
int64_t NumEdgesOnWorker(int worker_id) {
return edge_generators_[worker_id]->LocalCount();
}
auto &NodesWithLabel(const std::string &label, int worker_id) {
return worker_nodes_[worker_id][label];
}
auto NodesWithLabel(const std::string &label) {
return iter::chain.from_iterable(
iter::imap([ this, label ](int worker_id) -> auto & {
return NodesWithLabel(label, worker_id);
},
iter::range(num_workers_)));
}
auto NodesOnWorker(int worker_id) {
return iter::chain.from_iterable(
iter::imap([](auto &p) -> auto & { return p.second; },
worker_nodes_[worker_id]));
}
auto EdgesOnWorker(int worker_id) {
return iter::chain.from_iterable(
iter::imap([](auto &p) -> auto & { return p.second; },
worker_edges_[worker_id]));
}
gid::Gid &RandomNode(const std::string &label, int worker_id) {
CHECK(0 <= worker_id && worker_id < (int)worker_nodes_.size())
<< "Worker ID should be between 0 and " << worker_nodes_.size() - 1;
auto &label_nodes = worker_nodes_[worker_id];
auto found = label_nodes.find(label);
CHECK(found != label_nodes.end()) << "Label not found";
return found->second[rand_(gen_) * found->second.size()];
}
gid::Gid &RandomNode(const std::string &label) {
return RandomNode(label, rand_(gen_) * worker_nodes_.size());
}
gid::Gid &RandomNodeOnOtherWorker(const std::string &label, int worker_id) {
int worker_id2 = rand_(gen_) * (worker_nodes_.size() - 1);
if (worker_id2 >= worker_id) ++worker_id2;
return RandomNode(label, worker_id2);
}
gid::Gid CreateNode(
int worker_id, const std::vector<std::string> &labels,
const std::unordered_map<std::string, PropertyValue> &props) {
auto node_gid =
node_generators_[worker_id]->Next(std::experimental::nullopt);
nodes_[node_gid] = {node_gid, labels, props, {}, {}};
for (const auto &label : labels) {
worker_nodes_[worker_id][label].push_back(node_gid);
}
return node_gid;
}
gid::Gid CreateEdge(
gid::Gid from, gid::Gid to, const std::string &type,
const std::unordered_map<std::string, PropertyValue> &props) {
int worker_id = gid::CreatorWorker(from);
auto edge_gid =
edge_generators_[worker_id]->Next(std::experimental::nullopt);
nodes_[from].out_edges.emplace_back(edge_gid);
nodes_[to].in_edges.emplace_back(edge_gid);
edges_[edge_gid] = Edge{edge_gid, from, to, type, props};
worker_edges_[worker_id][type].push_back(edge_gid);
return edge_gid;
}
auto &GetNode(gid::Gid gid) { return nodes_[gid]; }
auto &GetEdge(gid::Gid gid) { return edges_[gid]; }
auto &GetNodes() { return nodes_; }
auto &GetEdges() { return edges_; }
void CreateIndex(std::string label, std::string property) {
indices_.emplace_back(std::move(label));
indices_.emplace_back(std::move(property));
}
auto &Indices() { return indices_; }
private:
typedef std::unordered_map<std::string, std::vector<gid::Gid>> LabelGid;
int num_workers_;
std::vector<std::string> indices_;
std::vector<LabelGid> worker_nodes_;
std::vector<LabelGid> worker_edges_;
std::unordered_map<gid::Gid, Node> nodes_;
std::unordered_map<gid::Gid, Edge> edges_;
std::vector<std::unique_ptr<gid::Generator>> edge_generators_;
std::vector<std::unique_ptr<gid::Generator>> node_generators_;
std::mt19937 gen_{std::random_device{}()};
std::uniform_real_distribution<> rand_{0.0, 1.0};
};
int Worker(gid::Gid gid) { return gid::CreatorWorker(gid); }
GraphState BuildFromConfig(int num_workers, const nlohmann::json &config) {
ValueGenerator value_generator;
GraphState state(num_workers);
for (const auto &index : GetWithDefault(config, "indexes", {})) {
auto index_parts = utils::Split(index, ".");
CHECK(index_parts.size() == 2) << "Index format should be Label.Property";
state.CreateIndex(index_parts[0], index_parts[1]);
}
CHECK(config["nodes"].is_array() && config["nodes"].size() > 0)
<< "Generator config must have 'nodes' array with at least one "
"element";
for (const auto &node_config : config["nodes"]) {
CHECK(node_config.is_object()) << "Node config must be a dict";
const auto &labels = node_config["labels"];
CHECK(labels.is_array()) << "Must provide an array of node labels";
CHECK(node_config.size() > 0)
<< "Node labels array must contain at least one element";
for (int i = 0; i < node_config["count_per_worker"]; ++i) {
for (int worker_id = 0; worker_id < num_workers; ++worker_id) {
const auto properties =
value_generator.MakeProperties(node_config["properties"]);
state.CreateNode(worker_id, labels, properties);
}
}
}
int num_hops = 0;
auto get_edge_endpoint = [num_workers, &state, &num_hops, &value_generator](
gid::Gid from, std::string label_to,
double hop_probability) {
if (num_workers > 1 && value_generator.Bernoulli(hop_probability)) {
++num_hops;
return state.RandomNodeOnOtherWorker(label_to, Worker(from));
}
return state.RandomNode(label_to, Worker(from));
};
for (const auto &edge_config : config["edges"]) {
CHECK(edge_config.is_object()) << "Edge config must be a dict";
const std::string &label_from = edge_config["from"];
const std::string &label_to = edge_config["to"];
const std::string &type = edge_config["type"];
const double hop_probability = edge_config["hop_probability"];
if (edge_config["kind"] == "random") {
for (int i = 0; i < edge_config["count"]; i++) {
gid::Gid from = state.RandomNode(label_from);
gid::Gid to = get_edge_endpoint(from, label_to, hop_probability);
const auto &props = value_generator.MakeProperties(
GetWithDefault(edge_config, "properties", nullptr));
state.CreateEdge(from, to, type, props);
}
}
if (edge_config["kind"] == "unique") {
for (const auto &from : state.NodesWithLabel(label_from)) {
gid::Gid to = get_edge_endpoint(from, label_to, hop_probability);
const auto &props = value_generator.MakeProperties(
GetWithDefault(edge_config, "properties", nullptr));
state.CreateEdge(from, to, type, props);
}
}
}
for (int worker_id = 0; worker_id < num_workers; ++worker_id) {
LOG(INFO) << "-- Summary for worker: " << worker_id;
LOG(INFO) << "---- Total nodes: " << state.NumNodesOnWorker(worker_id);
LOG(INFO) << "---- Total edges: " << state.NumEdgesOnWorker(worker_id);
}
LOG(INFO) << "-- Total number of hops: " << num_hops;
return state;
}
} // namespace snapshot_generation

View File

@ -0,0 +1,143 @@
#pragma once
#include <string>
#include <vector>
#include "communication/bolt/v1/encoder/base_encoder.hpp"
#include "durability/paths.hpp"
#include "durability/version.hpp"
#include "query/typed_value.hpp"
#include "graph_state.hpp"
namespace snapshot_generation {
// Snapshot layout is described in durability/version.hpp
static_assert(durability::kVersion == 5,
"Wrong snapshot version, please update!");
class SnapshotWriter {
public:
SnapshotWriter(const std::string &path, int worker_id,
uint64_t vertex_generator_local_count = 0,
uint64_t edge_generator_local_count = 0)
: worker_id_(worker_id), buffer_(path) {
encoder_.WriteRAW(durability::kMagicNumber.data(),
durability::kMagicNumber.size());
encoder_.WriteTypedValue(durability::kVersion);
encoder_.WriteInt(worker_id_);
encoder_.WriteInt(vertex_generator_local_count);
encoder_.WriteInt(edge_generator_local_count);
encoder_.WriteInt(0);
encoder_.WriteList(std::vector<query::TypedValue>{});
}
// reference to `buffer_` gets broken when moving, so let's just forbid moving
SnapshotWriter(SnapshotWriter &&rhs) = delete;
SnapshotWriter &operator=(SnapshotWriter &&rhs) = delete;
template <typename TValue>
void WriteList(const std::vector<TValue> &list) {
encoder_.WriteList(
std::vector<query::TypedValue>(list.begin(), list.end()));
}
storage::VertexAddress DefaultVertexAddress(gid::Gid gid) {
return storage::VertexAddress(gid, gid::CreatorWorker(gid));
}
storage::EdgeAddress DefaultEdgeAddress(gid::Gid gid) {
return storage::EdgeAddress(gid, gid::CreatorWorker(gid));
}
void WriteInlineEdge(const Edge &edge, bool write_from) {
encoder_.WriteInt(DefaultEdgeAddress(edge.gid).raw());
encoder_.WriteInt(write_from ? DefaultVertexAddress(edge.from).raw()
: DefaultVertexAddress(edge.to).raw());
encoder_.WriteString(edge.type);
}
void WriteNode(const Node &node,
const std::unordered_map<gid::Gid, Edge> &edges) {
encoder_.WriteRAW(underlying_cast(communication::bolt::Marker::TinyStruct) +
3);
encoder_.WriteRAW(underlying_cast(communication::bolt::Signature::Node));
encoder_.WriteInt(node.gid);
WriteList(node.labels);
encoder_.WriteMap(node.props);
encoder_.WriteInt(node.in_edges.size());
for (const auto &edge_idx : node.in_edges) {
WriteInlineEdge(edges.at(edge_idx), true);
}
encoder_.WriteInt(node.out_edges.size());
for (const auto &edge_idx : node.out_edges) {
WriteInlineEdge(edges.at(edge_idx), false);
}
++nodes_written_;
}
void WriteEdge(const Edge &edge) {
encoder_.WriteRAW(underlying_cast(communication::bolt::Marker::TinyStruct) +
5);
encoder_.WriteRAW(
underlying_cast(communication::bolt::Signature::Relationship));
encoder_.WriteInt(edge.gid);
encoder_.WriteInt(edge.from);
encoder_.WriteInt(edge.to);
encoder_.WriteString(edge.type);
encoder_.WriteMap(edge.props);
++edges_written_;
}
void Close() {
buffer_.WriteValue(nodes_written_);
buffer_.WriteValue(edges_written_);
buffer_.WriteValue(buffer_.hash());
buffer_.Close();
}
private:
int worker_id_;
int64_t nodes_written_{0};
int64_t edges_written_{0};
HashedFileWriter buffer_;
communication::bolt::BaseEncoder<HashedFileWriter> encoder_{buffer_};
};
void WriteToSnapshot(GraphState &state, const std::string &path) {
for (int worker_id = 0; worker_id < state.NumWorkers(); ++worker_id) {
const std::experimental::filesystem::path durability_dir =
path / std::experimental::filesystem::path("worker_" +
std::to_string(worker_id));
if (!durability::EnsureDir(durability_dir / "snapshots")) {
LOG(ERROR) << "Unable to create durability directory!";
exit(0);
}
const auto snapshot_file =
durability::MakeSnapshotPath(durability_dir, worker_id);
SnapshotWriter writer(snapshot_file, worker_id,
state.NumNodesOnWorker(worker_id),
state.NumEdgesOnWorker(worker_id));
writer.WriteList(state.Indices());
for (const auto &node : state.NodesOnWorker(worker_id)) {
writer.WriteNode(state.GetNode(node), state.GetEdges());
}
for (const auto &edge : state.EdgesOnWorker(worker_id)) {
writer.WriteEdge(state.GetEdge(edge));
}
writer.Close();
}
}
} // namespace snapshot_generation

View File

@ -0,0 +1,107 @@
#pragma once
#include <experimental/optional>
#include <random>
#include <unordered_map>
#include "glog/logging.h"
#include "json/json.hpp"
#include "storage/property_value.hpp"
namespace snapshot_generation {
// Helper class for property value generation.
class ValueGenerator {
using json = nlohmann::json;
public:
// Generates the whole property map based on the given config.
std::unordered_map<std::string, PropertyValue> MakeProperties(
const json &config) {
std::unordered_map<std::string, PropertyValue> props;
if (config.is_null()) return props;
CHECK(config.is_object()) << "Properties config must be a dict";
for (auto it = config.begin(); it != config.end(); it++) {
auto value = MakeValue(it.value());
if (value) props.emplace(it.key(), *value);
}
return props;
}
// Generates a single value based on the given config.
std::experimental::optional<PropertyValue> MakeValue(const json &config) {
if (config.is_object()) {
const std::string &type = config["type"];
const auto &param = config["param"];
if (type == "primitive") return Primitive(param);
if (type == "counter") return Counter(param);
if (type == "optional") return Optional(param);
if (type == "bernoulli") return Bernoulli(param);
if (type == "randint") return RandInt(param);
if (type == "randstring") return RandString(param);
LOG(FATAL) << "Unknown value type";
} else {
return Primitive(config);
}
}
// Returns whatever value is stored in the JSON
PropertyValue Primitive(const json &config) {
if (config.is_string()) return config.get<std::string>();
if (config.is_number_integer()) return config.get<int64_t>();
if (config.is_number_float()) return config.get<double>();
if (config.is_boolean()) return config.get<bool>();
LOG(FATAL) << "Unsupported primitive type";
}
// Increments the counter and returns its current value
int64_t Counter(const std::string &name) { return counters_[name]++; }
// Generates a random integer in range specified by JSON config
int64_t RandInt(const json &range) {
CHECK(range.is_array() && range.size() == 2)
<< "RandInt value gen config must be a list with 2 elements";
auto from = MakeValue(range[0])->Value<int64_t>();
auto to = MakeValue(range[1])->Value<int64_t>();
CHECK(from < to) << "RandInt lower range must be lesser than upper range";
return (int64_t)(rand_(gen_) * (to - from)) + from;
}
// Generates a random alphanumeric string with length specified by JSON config
std::string RandString(const json &length) {
static const char alphanum[] =
"0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz";
int length_int = MakeValue(length)->Value<int64_t>();
std::string r_val(length_int, 'a');
for (int i = 0; i < length_int; ++i)
r_val[i] = alphanum[(int64_t)(rand_(gen_) * strlen(alphanum))];
return r_val;
}
// Returns true with given probability
bool Bernoulli(double p) { return rand_(gen_) < p; }
// Returns a value specified by config with some probability, and nullopt
// otherwise
std::experimental::optional<PropertyValue> Optional(const json &config) {
CHECK(config.is_array() && config.size() == 2)
<< "Optional value gen config must be a list with 2 elements";
return Bernoulli(config[0]) ? MakeValue(config[1])
: std::experimental::nullopt;
}
private:
std::mt19937 gen_{std::random_device{}()};
std::uniform_real_distribution<> rand_{0.0, 1.0};
std::unordered_map<std::string, int64_t> counters_;
};
} // namespace snapshot_generation