Implement Louvain community detection algorithm

Summary: Louvain implementation according to original paper.

Reviewers: dsantl

Reviewed By: dsantl

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D2556
This commit is contained in:
Ivan Paljak 2019-11-19 14:04:51 +01:00
parent 29590f1112
commit ae0be9032e
4 changed files with 322 additions and 0 deletions

View File

@ -0,0 +1,18 @@
/// @file
///
/// The file contains function declarations of several community-detection
/// graph algorithms.
#pragma once
#include "data_structures/graph.hpp"
namespace algorithms {
/// Detects communities of an unidrected, weighted graph using the Louvain
/// algorithm. The algorithm attempts to maximze the modularity of a weighted
/// graph.
///
/// @param G pointer to an undirected, weighted graph which may contain
/// self-loops.
void Louvain(comdata::Graph *G);
} // namespace algorithms

View File

@ -0,0 +1,89 @@
#include "algorithms/algorithms.hpp"
#include <algorithm>
#include <map>
#include <random>
#include <unordered_map>
#include <glog/logging.h>
namespace {
void OptimizeLocally(comdata::Graph *G) {
// We will consider local optimizations uniformly at random.
std::random_device rd;
std::mt19937 g(rd());
std::vector<uint32_t> p(G->Size());
std::iota(p.begin(), p.end(), 0);
std::shuffle(p.begin(), p.end(), g);
double total_w = G->TotalWeight();
bool stable = false;
while (!stable) {
stable = true;
for (uint32_t node_id : p) {
std::unordered_map<uint32_t, double> c_contrib;
c_contrib[G->Community(node_id)] = 0;
for (const auto &neigh : G->Neighbours(node_id)) {
uint32_t nxt_id = neigh.dest;
double weight = neigh.weight;
double contrib = weight - G->IncidentWeight(node_id) *
G->IncidentWeight(nxt_id) / (2 * total_w);
c_contrib[G->Community(nxt_id)] += contrib;
}
auto best_c = std::max_element(c_contrib.begin(), c_contrib.end(),
[](const std::pair<uint32_t, double> &p1,
const std::pair<uint32_t, double> &p2) {
return p1.second < p2.second;
});
if (best_c->second - c_contrib[G->Community(node_id)] > 0) {
G->SetCommunity(node_id, best_c->first);
stable = false;
}
}
}
}
} // anonymous namespace
namespace algorithms {
void Louvain(comdata::Graph *G) {
OptimizeLocally(G);
// Collapse the locally optimized graph.
uint32_t collapsed_nodes = G->NormalizeCommunities();
if (collapsed_nodes == G->Size()) return;
comdata::Graph collapsed_G(collapsed_nodes);
std::map<std::pair<uint32_t, uint32_t>, double> collapsed_edges;
for (uint32_t node_id = 0; node_id < G->Size(); ++node_id) {
std::unordered_map<uint32_t, double> edges;
for (const auto &neigh : G->Neighbours(node_id)) {
uint32_t nxt_id = neigh.dest;
double weight = neigh.weight;
if (G->Community(nxt_id) < G->Community(node_id)) continue;
edges[G->Community(nxt_id)] += weight;
}
for (const auto &neigh : edges) {
uint32_t a = std::min(G->Community(node_id), neigh.first);
uint32_t b = std::max(G->Community(node_id), neigh.first);
collapsed_edges[{a, b}] += neigh.second;
}
}
for (const auto &p : collapsed_edges)
collapsed_G.AddEdge(p.first.first, p.first.second, p.second);
// Repeat until no local optimizations can be found.
Louvain(&collapsed_G);
// Propagate results from collapsed graph.
for (uint32_t node_id = 0; node_id < G->Size(); ++node_id)
G->SetCommunity(node_id, collapsed_G.Community(G->Community(node_id)));
G->NormalizeCommunities();
}
} // namespace algorithms

View File

@ -0,0 +1,106 @@
#include "data_structures/graph.hpp"
#include <numeric>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <glog/logging.h>
namespace comdata {
Graph::Graph(uint32_t n_nodes) : n_nodes_(n_nodes), total_w_(0) {
adj_list_.resize(n_nodes, {});
inc_w_.resize(n_nodes, 0);
// each node starts as its own separate community.
community_.resize(n_nodes);
std::iota(community_.begin(), community_.end(), 0);
}
uint32_t Graph::Size() const {
return n_nodes_;
}
uint32_t Graph::Community(uint32_t node) const {
CHECK(node < n_nodes_) << "Node index out of range";
return community_[node];
}
void Graph::SetCommunity(uint32_t node, uint32_t c) {
CHECK(node < n_nodes_) << "Node index out of range";
community_[node] = c;
}
uint32_t Graph::NormalizeCommunities() {
std::set<uint32_t> c_id(community_.begin(), community_.end());
std::unordered_map<uint32_t, uint32_t> cmap;
uint32_t id = 0;
for (uint32_t c : c_id) {
cmap[c] = id;
++id;
}
for (uint32_t node_id = 0; node_id < n_nodes_; ++node_id)
community_[node_id] = cmap[community_[node_id]];
return id;
}
void Graph::AddEdge(uint32_t node1, uint32_t node2, double weight) {
CHECK(node1 < n_nodes_) << "Node index out of range";
CHECK(node2 < n_nodes_) << "Node index out of range";
CHECK(weight > 0) << "Weights must be positive";
CHECK(edges_.find({node1, node2}) == edges_.end()) << "Edge already exists";
edges_.emplace(node1, node2);
edges_.emplace(node2, node1);
total_w_ += weight;
adj_list_[node1].emplace_back(node2, weight);
inc_w_[node1] += weight;
if (node1 != node2) {
adj_list_[node2].emplace_back(node1, weight);
inc_w_[node2] += weight;
}
}
uint32_t Graph::Degree(uint32_t node) const {
CHECK(node < n_nodes_) << "Node index out of range";
return static_cast<uint32_t>(adj_list_[node].size());
}
double Graph::IncidentWeight(uint32_t node) const {
CHECK(node < n_nodes_) << "Node index out of range";
return inc_w_[node];
}
double Graph::TotalWeight() const {
return total_w_;
}
double Graph::Modularity() const {
double ret = 0;
// Since all weights should be positive, this implies that our graph has
// no edges.
if (total_w_ == 0)
return 0;
for (uint32_t i = 0; i < n_nodes_; ++i) {
for (const auto &neigh : adj_list_[i]) {
uint32_t j = neigh.dest;
double w = neigh.weight;
if (Community(i) != Community(j)) continue;
ret += w - (IncidentWeight(i) * IncidentWeight(j) / (2.0 * total_w_));
}
}
ret /= 2 * total_w_;
return ret;
}
const std::vector<Neighbour>& Graph::Neighbours(uint32_t node) const {
CHECK(node < n_nodes_) << "Node index out of range";
return adj_list_[node];
}
} // namespace comdata

View File

@ -0,0 +1,109 @@
/// @file graph.hpp
#pragma once
#include <cstdint>
#include <set>
#include <vector>
namespace comdata {
struct Neighbour {
uint32_t dest;
double weight;
Neighbour(uint32_t d, double w) : dest(d), weight(w) {}
};
/// Class which models a weighted, undirected graph with necessary
/// functionalities for community detection algorithms.
class Graph {
public:
/// Constructs a new graph with a given number of nodes and no edges between
/// them.
///
/// At the moment, the implementation assumes (and enforces) that all nodes
/// are indexed from 0 to n_nodes - 1. This will be changed in the final
/// implementation. - TODO(ipaljak)
///
/// @param n_nodes Number of nodes in the graph.
explicit Graph(uint32_t n_nodes);
/// @return number of nodes in the graph.
uint32_t Size() const;
/// Adds a bidirectional, weighted edge to the graph between the given
/// nodes. If both given nodes are the same, the method inserts a weighted
/// self-loop.
///
/// There should be no edges between the given nodes when before invoking
/// this method.
///
/// @param node1 index of an incident node.
/// @param node2 index of an incident node.
/// @param weight real value which represents the weight of the edge.
void AddEdge(uint32_t node1, uint32_t node2, double weight);
/// @param node index of node.
/// @return community where the node belongs to.
uint32_t Community(uint32_t node) const;
/// Adds a given node to a given community.
///
/// @param node index of node.
/// @param c community where the given node should go in.
void SetCommunity(uint32_t node, uint32_t c);
/// Normalizes the values of communities. More precisely, after invoking this
/// method communities will be indexed by successive integers starting from 0.
///
/// Note: this method is computationally expensive and takes O(|V|)
/// time, i.e., it traverses all nodes in the graph.
///
/// @return number of communities in the graph
uint32_t NormalizeCommunities();
/// Returns the number of incident edges to a given node. Self-loops
/// contribute a single edge to the degree.
///
/// @param node index of node.
/// @return degree of given node.
uint32_t Degree(uint32_t node) const;
/// Returns the total weight of incident edges to a given node. Weight
/// of a self loop contributes once to the total sum.
///
/// @param node index of node.
/// @return total incident weight of a given node.
double IncidentWeight(uint32_t node) const;
/// @return total weight of all edges in a graph.
double TotalWeight() const;
/// Calculates the modularity of the graph which is defined as a real value
/// between -1 and 1 that measures the density of links inside communities
/// compared to links between communities.
///
/// Note: this method is computationally expensive and takes O(|V| + |E|)
/// time, i.e., it traverses the entire graph.
///
/// @return modularity of the graph.
double Modularity() const;
/// Returns nodes adjacent to a given node.
///
/// @param node index of node.
/// @return list of neighbouring nodes.
const std::vector<Neighbour>& Neighbours(uint32_t node) const;
private:
uint32_t n_nodes_;
double total_w_;
std::vector<std::vector<Neighbour>> adj_list_;
std::set<std::pair<uint32_t, uint32_t>> edges_;
std::vector<double> inc_w_;
std::vector<uint32_t> community_;
};
} // namespace comdata