Implement the PageRank module

Reviewers: teon.banek

Reviewed By: teon.banek

Subscribers: pullbot

Differential Revision: https://phabricator.memgraph.io/D2730
This commit is contained in:
Lovro Lugovic 2020-03-19 10:05:27 +01:00
parent b7738c64b3
commit e5b3414335
2 changed files with 148 additions and 1 deletions

View File

@ -439,12 +439,13 @@ class Record:
class Vertices:
'''Iterable over vertices in a graph.'''
__slots__ = ('_graph',)
__slots__ = ('_graph', '_len')
def __init__(self, graph):
if not isinstance(graph, _mgp.Graph):
raise TypeError("Expected '_mgp.Graph', got '{}'".format(type(graph)))
self._graph = graph
self._len = None
def __deepcopy__(self, memo):
# This is the same as the shallow copy, because we want to share the
@ -468,6 +469,18 @@ class Vertices:
raise InvalidContextError()
vertex = vertices_it.next()
def __contains__(self, vertex):
try:
_ = self.graph.get_vertex_by_id(vertex.id)
return True
except IndexError:
return False
def __len__(self):
if not self._len:
self._len = sum(1 for _ in self)
return self._len
class Graph:
'''State of the graph database in current ProcCtx.'''

134
query_modules/pagerank.py Normal file
View File

@ -0,0 +1,134 @@
import mgp
import networkx as nx
from itertools import chain
class VertexDictionary:
def __init__(self, graph, prop):
self.graph = graph
self.prop = prop
self.len = None
def get(self, vertex, default=None):
return vertex.properties.get(self.prop, default=default)
def items(self):
for v in self.graph.vertices:
if self.prop in v.properties:
yield v, v.properties[self.prop]
def keys(self):
for k, v in self.items():
yield k
def values(self):
for k, v in self.items():
yield v
def __len__(self):
if not self.len:
self.len = sum(1 for _ in self.items())
return self.len
def __iter__(self):
for k, v in self.items():
yield k
def __getitem__(self, vertex):
try:
return vertex.properties[self.prop]
except KeyError:
raise KeyError(("Vertex {} doesn\t have the required " +
"property '{}'").format(vertex.id, self.prop))
def __contains__(self, vertex):
return vertex in self.graph.vertices and self.prop in vertex.properties
@mgp.read_proc
def pagerank(ctx: mgp.ProcCtx,
alpha: mgp.Number = 0.85,
personalization: mgp.Nullable[str] = None,
max_iter: int = 100,
tol: mgp.Number = 1e-06,
nstart: mgp.Nullable[str] = None,
weight: mgp.Nullable[str] = 'weight',
dangling: mgp.Nullable[str] = None
) -> mgp.Record(node=mgp.Vertex, rank=float):
'''Run the PageRank algorithm on the whole graph.
The available parameters are:
- `alpha` -- Damping parameter.
- `personalization` -- The "personalization vector". A string specifying
the property that will be looked up for every node to give the
personalization value. If a node doesn't have the specified property, its
personalization value will be zero. By default, a uniform distribution is
used.
- `max_iter` -- Maximum number of iterations in the power method eigenvalue
solver.
- `tol` -- Error tolerance used to check for convergence in the power
eigenvalue method solver.
- `nstart` -- A string specifying the property that will be looked up for
every node to give the starting value for the iteration.
- `weight` -- A string specifying the property that will be looked up for
every edge to give the weight. If None or if the property doesn't exist,
weights are set to 1.
- `dangling` -- The outedges to be assigned to any "dangling" nodes, i.e.,
nodes without any outedges. A string specifying the property that will be
looked up for every node to give the weight of the outedge that points to
that node. By default, dangling nodes are given outedges according to the
personalization vector. This must be selected to result in an irreducible
transition matrix. It may be common to have the dangling dictionary be
the same as the personalization dictionary.
Return a single record for every node. Each record has two fields, `node`
and `rank`, which together give the calculated rank for the node.
As an example, the following openCypher query calculates the ranks of all
the nodes in the graph using the PageRank algorithm. The personalization
value for every node is taken from its property named 'personalization',
while the `alpha` and `max_iter` parameters are set to 0.85 and 150
respectively:
CALL pagerank.pagerank(0.85, 'personalization', 150) YIELD *;
'''
def to_vertex_dictionary(prop):
return None if prop is None else VertexDictionary(ctx.graph, prop)
def make_and_check_vertex(v):
for prop in (personalization, nstart, dangling):
if prop is None:
continue
if not isinstance(v.properties.get(prop, default=1), (int, float)):
raise TypeError(("Property '{}' of vertex '{}' needs to " +
"be a number").format(prop, v.id))
return v
def make_and_check_edge(e):
if (weight is not None and
not isinstance(e.properties.get(weight, default=1), (int, float))):
raise TypeError("Property '{}' of edge '{}' needs to be a number"
.format(weight, e.id))
return e.from_vertex, e.to_vertex, e.properties
g = nx.DiGraph()
g.add_nodes_from(make_and_check_vertex(v) for v in ctx.graph.vertices)
g.add_edges_from(make_and_check_edge(e)
for v in ctx.graph.vertices
for e in chain(v.in_edges, v.out_edges))
pg = nx.pagerank(g, alpha=alpha,
personalization=to_vertex_dictionary(personalization),
max_iter=max_iter, tol=tol,
nstart=to_vertex_dictionary(nstart), weight=weight,
dangling=to_vertex_dictionary(dangling))
return [mgp.Record(node=k, rank=v) for k, v in pg.items()]