memgraph/experimental/distributed/src/spinner.hpp

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <experimental/tuple>
#include <iostream>
#include <numeric>
#include <random>
#include <tuple>
#include <vector>

#include "graph.hpp"

namespace spinner {
// Balancing constant of the Spinner penalty: Penalties() divides every
// worker's load by `c` times the average load per worker.
// NOTE(review): this is a mutable, non-inline global defined in a header;
// confirm the header is only included from a single translation unit.
double c{2.0};

/**
 * Returns the index of the maximum score in the given vector.
 * If there are multiple maximums, one of them is chosen at random
 * (using rand(), exactly one draw per call).
 *
 * @param scores - the scores to search. Must not be empty and must
 *  contain at least one non-NaN value, otherwise there is no maximum
 *  whose index could be returned.
 * @return - the index (size_t) of a maximal element of `scores`.
 */
auto MaxRandom(const std::vector<double> &scores) {
  // an empty input would otherwise end in `rand() % 0` below
  assert(!scores.empty() && "MaxRandom requires at least one score");

  std::vector<size_t> best_indices;
  double current_max = std::numeric_limits<double>::lowest();

  for (size_t ind = 0; ind < scores.size(); ++ind) {
    if (scores[ind] > current_max) {
      // strictly better score: all candidates collected so far are stale
      current_max = scores[ind];
      best_indices.clear();
    }
    // also true right after the branch above, so the new maximum is kept
    if (scores[ind] == current_max) best_indices.emplace_back(ind);
  }

  // NaN compares false against everything, so an all-NaN input
  // leaves no candidate at all
  assert(!best_indices.empty() && "MaxRandom found no comparable score");

  return best_indices[rand() % best_indices.size()];
}

/**
 * Returns the index of the best (highest scored) worker
 * for the given node. If there are multiple workers with
 * the best score, the node prefers to remain on its current
 * worker (if that one is among the best), otherwise one of
 * the best is chosen at random.
 *
 * The score of a worker is the fraction of the node's edges
 * (incoming and outgoing) that point to that worker, minus the
 * worker's penalty. A node without any edges has a neighbourhood
 * fraction of zero for every worker, so only the penalties decide.
 *
 * @param distributed - the distributed system.
 * @param node - the node which is being evaluated.
 * @param penalties - a vector of penalties (per worker); it is
 *  indexed by worker, so it needs one entry per worker.
 * @param current_worker - the worker on which the given
 *  node is currently residing.
 * @return - std::pair<size_t, std::vector<double>> which is a
 *  pair of (best worker, score_per_worker).
 */
auto BestWorker(const Distributed &distributed, const Node &node,
                const std::vector<double> &penalties, int current_worker) {
  // scores per worker
  std::vector<double> scores(distributed.WorkerCount(), 0.0);

  // every edge adds one point to the worker named by its worker_id_
  // (assumes worker_id_ is a valid index into scores - TODO confirm)
  for (auto &edge : node.edges_in()) scores[edge.worker_id_] += 1.0;
  for (auto &edge : node.edges_out()) scores[edge.worker_id_] += 1.0;

  const auto degree = node.edges_out().size() + node.edges_in().size();

  for (size_t worker = 0; worker < scores.size(); ++worker) {
    // normalize contribution of worker over neighbourhood size;
    // skipped for an isolated node, where 0.0 / 0 would yield NaN
    // scores that can never be selected as a maximum
    if (degree != 0) scores[worker] /= degree;
    // subtract balancing penalty
    scores[worker] -= penalties[worker];
  }

  // pick the best destination, but prefer to stay if you can
  size_t destination = MaxRandom(scores);
  if (scores[current_worker] == scores[destination])
    destination = current_worker;

  // scores are not needed locally any more, hand them over without a copy
  return std::make_pair(destination, std::move(scores));
}

/**
 * Selects the worker cardinality on which the Spinner worker
 * penalty is based: the vertices or the edges of a worker.
 */
enum class PenaltyType {
  Vertex,
  Edge
};

/**
 * Calculates Spinner penalties for workers in the given
 * distributed system.
 *
 * The load of a worker is either its node count or the sum of
 * degrees of its nodes, depending on `penalty_type`. The penalty
 * of a worker is its load divided by `c` times the average load
 * per worker. If the total load is zero, all penalties are zero.
 *
 * @param distributed - the distributed system.
 * @param penalty_type - which cardinality the load is based on.
 * @return - std::vector<double> with one penalty per worker, in
 *  the iteration order of `distributed`.
 */
auto Penalties(const Distributed &distributed,
               PenaltyType penalty_type = PenaltyType::Edge) {
  std::vector<double> penalties;
  penalties.reserve(distributed.WorkerCount());
  int64_t total_count{0};

  for (const auto &worker : distributed) {
    int64_t worker_count{0};
    switch (penalty_type) {
      case PenaltyType::Vertex:
        worker_count += worker.NodeCount();
        break;
      case PenaltyType::Edge:
        for (const auto &node_kv : worker) {
          // Spinner counts the edges on a worker as the sum
          // of degrees of nodes on that worker. In that sense
          // both incoming and outgoing edges are individually
          // added...
          worker_count += node_kv.second.edges_out().size();
          worker_count += node_kv.second.edges_in().size();
        }
        break;
    }
    total_count += worker_count;
    penalties.emplace_back(static_cast<double>(worker_count));
  }

  // nothing was counted, so every load is already 0.0; dividing by the
  // (zero) average load would turn all penalties into NaN
  if (total_count == 0) return penalties;

  // c times the average load per worker, identical for every worker
  const double scale = c * total_count / distributed.WorkerCount();
  for (auto &penalty : penalties) penalty /= scale;

  return penalties;
}

/**
 * Does one Spinner step, modifying the given distributed system.
 *
 * Penalties are calculated once, then BestWorker is evaluated for
 * every node of every worker. All nodes whose best worker differs
 * from their current one are collected first and moved afterwards.
 */
void PerformSpinnerStep(Distributed &distributed) {
  const auto penalties = Penalties(distributed);

  // A strategy for limiting the number of movements performed
  // in one step can be injected here. Limiting could be based
  // on (for example):
  //  - a maximum number of movements per worker
  //  - accepting only movements that are above a threshold
  //    (score improvement or something)
  //  - not executing on all the workers (also prevents
  //    oscillations)
  //
  // This first implementation just accumulates all the
  // movements and executes them together.

  // relocation info: the address of the Node that needs
  // to relocate, paired with its destination worker
  std::vector<std::pair<GlobalAddress, int>> movements;

  for (const Worker &worker : distributed) {
    for (const auto &gid_and_node : worker) {
      // only the best destination is needed, the scores are dropped
      const int destination = static_cast<int>(
          BestWorker(distributed, gid_and_node.second, penalties, worker.id_)
              .first);
      if (destination != worker.id_) {
        movements.emplace_back(GlobalAddress(worker.id_, gid_and_node.first),
                               destination);
      }
    }
  }

  // execute movements. it is likely that in the real system
  // this will need to happen as a single db transaction
  for (const auto &movement : movements) {
    distributed.MoveNode(movement.first, movement.second);
  }
}
}  // namespace spinner
Merged experimental repo. Summary: Fixed distributed init. Add CMakeLists to build experimentall/distribuedClosing unused Channels, work in progress. Make System the owner of Reactor. This entails changing shared_ptr -> unique_ptr and some pointers to references. Merged experimental repository into memgraph. Moved experimental repo to experimental directory. Removed obsolete experimental files. Added comments. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Subscription service unsubscribe. Add Close method on EventStream. Add placeholder for the configuration class. Remove comments. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Clean-up parameters for EventQueue. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Add Channel::serialize method implementation. Merge. Add docs on event stream. Clang-format merge conflicts. First implementations of serialize methods. Add hostname, port, and names as methods in Channel base class. Add reactor name and name methods to LocalChannel. Add reactor name to LocalChannel. Add name to LocalChannel. Add serialization service. Serialize_test removed. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Move Message to the end of communications files. Full example of serialization with cereal. Fix constructor calls. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Avoid using `FindChannel` in the transaction code. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Init script creates libs folder. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Add System pointer to Network. serialized_test binary is removed from the repo. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Cereal basic example. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Callbacks finished. 
Always open the main channel by default. Fixed callbacks, wrong number of emplace arguments. Callbacks WIP. Raise connector mutex to reactor level. Add argument to LockedPush. Fix data race in connector closing. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Add functional header. Fixed to make the changes work. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Refactored connectors into Reactors Use shared pointer for the mutex. Rename to Open and Close in implementation file. Rename Create to Open and Destroy to Close. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Adding callback to Reactors; work in progress Add stubs for asynchronous channel resolution. Add stubs for the networking service. Replace reactor pointers with shared ptrs, disable System assignment. Forbid assignment. Replace raw channel pointers with shared pointers. Replace raw event stream pointer with shared pointer. Rename default stream name. Use recursive mutex in System. Main uses Spawn method. All files are formatted. Move thread local to a cpp file. Work in progress on Spawn method. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Kill out graph.hpp to make it compile Add Spawn method prototype. Fix return type. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Add method used to create nameless channels. Add format script. Introduce the Reactor base class. Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Add compile script. 
added comments about class terminology Spinner rewrite (graph data structures and algo) Organize Spinner code Create working version Improves Spinner implementation and testing Spinner fix .arcconfig Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Add graph Spinner work Spinner added Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Add communication .clang-format + ycm config. Init. Distributed hackaton. Implementation of lock-free list from Petar Sirkovic. pro compiler Merge branch 'master' of https://phabricator.memgraph.io/source/experimental Implement Match Add test data. Insert quotes before and after props and labels Multiple node declarations, along with edges. After merge. Node property creations work now. Bug fix in visitor After merge. Implement node creation with labels. Implement boolean operators Tidy up ImplementedVistor. Implement expression6 (addition) Implement basic type visitor functions Cypher Visitor Implementation class created. Fix style. Fix template synrax in main.cpp Merge remote-tracking branch 'origin/master' Add pretty_print Update main and BaseVisitor to present return value. Headers included. Temporary fix. Antlr4 module reintroduced. Updateao git config. Fix trailing space. CMake 2.8 fix rerolled, 3.1 minimum version req. Fix for Cmake version 2.8 compatibility. Build works. Tidy src folder. Include generated files for antlr. Included antlr generated files. Changed directory structure. Cmake: include subdirectory. GenerateRuntime, partial. Add GenerateParser target to cmake. Remove main.cpp Merge remote-tracking branch 'origin/master' Add requirements Main file added. Run the lexer and parser with this. Add antlr_generated to baby_compiler Experimental memory_tracker and opencypher tck tests Reviewers: mislav.bradac Reviewed By: mislav.bradac Subscribers: pullbot Differential Revision: https://phabricator.memgraph.io/D627 2017-08-03 18:08:39 +08:00			`#include <algorithm>`
			`#include <cassert>`
			`#include <cstddef>`
			`#include <experimental/tuple>`
			`#include <iostream>`
			`#include <numeric>`
			`#include <random>`
			`#include <tuple>`
			`#include <vector>`

			`#include "graph.hpp"`

			`namespace spinner {`
			`// const for balancing penalty`
			`double c = 2.0;`

			`/**`
			`* Returns the index of the maximum score in the given vector.`
			`* If there are multiple minimums, one is chosen at random.`
			`*/`
			`auto MaxRandom(const std::vector<double> &scores) {`
			`std::vector<size_t> best_indices;`
			`double current_max = std::numeric_limits<double>::lowest();`

			`for (size_t ind = 0; ind < scores.size(); ind++) {`
			`if (scores[ind] > current_max) {`
			`current_max = scores[ind];`
			`best_indices.clear();`
			`}`
			`if (scores[ind] == current_max) {`
			`best_indices.emplace_back(ind);`
			`}`
			`}`

			`return best_indices[rand() % best_indices.size()];`
			`}`

			`/**`
			`* Returns the index of the best (highest scored) worker`
			`* for the given node. If there are multiple workers with`
			`* the best score, node prefers to remain on the same worker`
			`* (if among the best), or one is chosen at random.`
			`*`
			`* @param distributed - the distributed system.`
			`* @param node - the node which is being evaluated.`
			`* @param penalties - a vector of penalties (per worker).`
			`* @param current_worker - the worker on which the given`
			`* node is currently residing.`
			`* @return - std::pair<int, std::vector<double>> which is a`
			`* pair of (best worker, score_per_worker).`
			`*/`
			`auto BestWorker(const Distributed &distributed, const Node &node,`
			`const std::vector<double> &penalties, int current_worker) {`
			`// scores per worker`
			`std::vector<double> scores(distributed.WorkerCount(), 0.0);`

			`for (auto &edge : node.edges_in()) scores[edge.worker_id_] += 1.0;`
			`for (auto &edge : node.edges_out()) scores[edge.worker_id_] += 1.0;`

			`for (int worker = 0; worker < distributed.WorkerCount(); ++worker) {`
			`// normalize contribution of worker over neighbourhood size`
			`scores[worker] /= node.edges_out().size() + node.edges_in().size();`
			`// add balancing penalty`
			`scores[worker] -= penalties[worker];`
			`}`

			`// pick the best destination, but prefer to stay if you can`
			`size_t destination = MaxRandom(scores);`
			`if (scores[current_worker] == scores[destination])`
			`destination = current_worker;`

			`return std::make_pair(destination, scores);`
			`}`

			`/** Indication if Spinner worker penality is calculated based on`
			`* vertex or edge worker cardinalities */`
			`enum class PenaltyType { Vertex, Edge };`

			`/** Calcualtes Spinner penalties for workers in the given`
			`* distributed system. */`
			`auto Penalties(const Distributed &distributed,`
			`PenaltyType penalty_type = PenaltyType::Edge) {`
			`std::vector<double> penalties;`
			`int64_t total_count{0};`

			`for (const auto &worker : distributed) {`
			`int64_t worker_count{0};`
			`switch (penalty_type) {`
			`case PenaltyType::Vertex:`
			`worker_count += worker.NodeCount();`
			`break;`
			`case PenaltyType::Edge:`
			`for (const auto &node_kv : worker) {`
			`// Spinner counts the edges on a worker as the sum`
			`// of degrees of nodes on that worker. In that sense`
			`// both incoming and outgoing edges are individually`
			`// added...`
			`worker_count += node_kv.second.edges_out().size();`
			`worker_count += node_kv.second.edges_in().size();`
			`}`
			`break;`
			`}`
			`total_count += worker_count;`
			`penalties.emplace_back(worker_count);`
			`}`

			`for (auto &penalty : penalties)`
			`penalty /= c * total_count / distributed.WorkerCount();`

			`return penalties;`
			`}`

			`/** Do one spinner step (modifying the given distributed) */`
			`void PerformSpinnerStep(Distributed &distributed) {`
			`auto penalties = Penalties(distributed);`

			`// here a strategy can be injected for limiting`
			`// the number of movements performed in one step.`
			`// limiting could be based on (for example):`
			`// - limiting the number of movements per worker`
			`// - limiting only to movements that are above`
			`// a treshold (score improvement or something)`
			`// - not executing on all the workers (also prevents`
			`// oscilations)`
			`//`
			`// in the first implementation just accumulate all`
			`// the movements and execute together.`

			`// relocation info: contains the address of the Node`
			`// that needs to relocate and it's destination worker`
			`std::vector<std::pair<GlobalAddress, int>> movements;`

			`for (const Worker &worker : distributed)`
			`for (const auto &gid_node_pair : worker) {`
			`// (best destination, scores) pair for node`
			`std::pair<int, std::vector<double>> destination_scores =`
			`BestWorker(distributed, gid_node_pair.second, penalties, worker.id_);`
			`if (destination_scores.first != worker.id_)`
			`movements.emplace_back(GlobalAddress(worker.id_, gid_node_pair.first),`
			`destination_scores.first);`
			`}`

			`// execute movements. it is likely that in the real system`
			`// this will need to happen as a single db transaction`
			`for (const auto &m : movements) distributed.MoveNode(m.first, m.second);`
			`}`
			`} // namespace spinner`