Implement the PageRank module

Reviewers: teon.banek Reviewed By: teon.banek Subscribers: pullbot Differential Revision: https://phabricator.memgraph.io/D2730
2020-03-19 10:05:27 +01:00 · 2020-03-19 10:05:27 +01:00 · e5b3414335
commit e5b3414335
parent b7738c64b3
2 changed files with 148 additions and 1 deletions
--- a/include/mgp.py
+++ b/include/mgp.py
@ -439,12 +439,13 @@ class Record:

 class Vertices:
    '''Iterable over vertices in a graph.'''
-    __slots__ = ('_graph',)
+    __slots__ = ('_graph', '_len')

    def __init__(self, graph):
        if not isinstance(graph, _mgp.Graph):
            raise TypeError("Expected '_mgp.Graph', got '{}'".format(type(graph)))
        self._graph = graph
+        # Cached vertex count; stays None until __len__ computes it.
+        self._len = None

    def __deepcopy__(self, memo):
        # This is the same as the shallow copy, because we want to share the
@ -468,6 +469,18 @@ class Vertices:
                raise InvalidContextError()
            vertex = vertices_it.next()

+    def __contains__(self, vertex):
+        '''Return True if the graph has a vertex with the ID of `vertex`.'''
+        try:
+            # The wrapped graph is stored in the `_graph` slot by __init__;
+            # `__slots__` declares no `graph` attribute.
+            self._graph.get_vertex_by_id(vertex.id)
+        except IndexError:
+            return False
+        return True
+
+    def __len__(self):
+        '''Return the number of vertices, counting them on the first call.'''
+        # Compare with None (not truthiness) so that a count of 0 is cached
+        # too instead of being recomputed on every call.
+        if self._len is None:
+            self._len = sum(1 for _ in self)
+        return self._len
+

 class Graph:
    '''State of the graph database in current ProcCtx.'''
--- a/query_modules/pagerank.py
+++ b/query_modules/pagerank.py
@ -0,0 +1,134 @@
+import mgp
+import networkx as nx
+from itertools import chain
+
+
+class VertexDictionary:
+    '''Read-only, dict-like view of one property over the graph's vertices.
+
+    The keys are the vertices of `graph` that have the property `prop` and
+    the value for a vertex is the value of that property. Nothing is copied;
+    every lookup reads the property from the vertex itself.
+    '''
+
+    def __init__(self, graph, prop):
+        self.graph = graph
+        self.prop = prop
+        # Number of vertices having `prop`; computed on the first `len()`.
+        self.len = None
+
+    def get(self, vertex, default=None):
+        '''Return the property value of `vertex`, or `default` if missing.'''
+        return vertex.properties.get(self.prop, default=default)
+
+    def items(self):
+        '''Yield `(vertex, value)` for every vertex that has the property.'''
+        for v in self.graph.vertices:
+            if self.prop in v.properties:
+                yield v, v.properties[self.prop]
+
+    def keys(self):
+        '''Yield every vertex that has the property.'''
+        for vertex, _ in self.items():
+            yield vertex
+
+    def values(self):
+        '''Yield the property value of every vertex that has the property.'''
+        for _, value in self.items():
+            yield value
+
+    def __len__(self):
+        # Compare with None (not truthiness) so that a count of 0 is cached
+        # too instead of triggering a full scan on every call.
+        if self.len is None:
+            self.len = sum(1 for _ in self.items())
+        return self.len
+
+    def __iter__(self):
+        # Iterating the view yields its keys, like a regular dictionary.
+        return self.keys()
+
+    def __getitem__(self, vertex):
+        '''Return the property value of `vertex`.
+
+        Raise KeyError if the vertex doesn't have the property.
+        '''
+        try:
+            return vertex.properties[self.prop]
+        except KeyError:
+            raise KeyError(("Vertex {} doesn't have the required " +
+                            "property '{}'").format(vertex.id, self.prop))
+
+    def __contains__(self, vertex):
+        return vertex in self.graph.vertices and self.prop in vertex.properties
+
+
+@mgp.read_proc
+def pagerank(ctx: mgp.ProcCtx,
+             alpha: mgp.Number = 0.85,
+             personalization: mgp.Nullable[str] = None,
+             max_iter: int = 100,
+             tol: mgp.Number = 1e-06,
+             nstart: mgp.Nullable[str] = None,
+             weight: mgp.Nullable[str] = 'weight',
+             dangling: mgp.Nullable[str] = None
+             ) -> mgp.Record(node=mgp.Vertex, rank=float):
+    '''Compute the PageRank of every node in the graph.
+
+    Parameters:
+
+    - `alpha` -- Damping parameter.
+
+    - `personalization` -- Name of the node property holding the
+      "personalization vector" entry of each node. A node that doesn't have
+      the property gets a personalization value of zero. When not given, a
+      uniform distribution is used.
+
+    - `max_iter` -- Upper bound on the number of iterations of the power
+      method eigenvalue solver.
+
+    - `tol` -- Error tolerance used by the power method eigenvalue solver to
+      check for convergence.
+
+    - `nstart` -- Name of the node property holding the starting value of the
+      iteration for each node.
+
+    - `weight` -- Name of the edge property holding the edge weight. If None
+      or if the property doesn't exist, weights are set to 1.
+
+    - `dangling` -- The outedges to be assigned to any "dangling" nodes,
+      i.e., nodes without any outedges. Name of the node property holding,
+      for each node, the weight of the outedge that points to that node. When
+      not given, dangling nodes get outedges according to the personalization
+      vector. This must be selected to result in an irreducible transition
+      matrix. It may be common to have the dangling dictionary be the same
+      as the personalization dictionary.
+
+    One record is returned per node. Its two fields, `node` and `rank`,
+    together give the calculated rank for the node.
+
+    For example, the following openCypher query ranks all the nodes of the
+    graph with PageRank, reading the personalization value of every node from
+    its property named 'personalization' and setting `alpha` and `max_iter`
+    to 0.85 and 150 respectively:
+
+    CALL pagerank.pagerank(0.85, 'personalization', 150) YIELD *;
+
+    '''
+    # Node properties that were actually requested and must be validated.
+    requested_props = [name
+                       for name in (personalization, nstart, dangling)
+                       if name is not None]
+
+    def checked_vertex(vertex):
+        # A vertex lacking the property passes, as the lookup falls back to
+        # a number.
+        for name in requested_props:
+            value = vertex.properties.get(name, default=1)
+            if not isinstance(value, (int, float)):
+                raise TypeError(
+                    "Property '{}' of vertex '{}' needs to be a number"
+                    .format(name, vertex.id))
+        return vertex
+
+    def edge_triple(edge):
+        # Validate the weight (when one is requested) and describe the edge
+        # as (source, target, attributes).
+        if weight is not None:
+            value = edge.properties.get(weight, default=1)
+            if not isinstance(value, (int, float)):
+                raise TypeError(
+                    "Property '{}' of edge '{}' needs to be a number"
+                    .format(weight, edge.id))
+        return edge.from_vertex, edge.to_vertex, edge.properties
+
+    def all_edges():
+        # Edges are gathered from the in- and out-edges of every vertex.
+        for vertex in ctx.graph.vertices:
+            yield from chain(vertex.in_edges, vertex.out_edges)
+
+    def property_view(name):
+        if name is None:
+            return None
+        return VertexDictionary(ctx.graph, name)
+
+    digraph = nx.DiGraph()
+    digraph.add_nodes_from(map(checked_vertex, ctx.graph.vertices))
+    digraph.add_edges_from(map(edge_triple, all_edges()))
+
+    personalization_view = property_view(personalization)
+    nstart_view = property_view(nstart)
+    dangling_view = property_view(dangling)
+
+    ranks = nx.pagerank(digraph,
+                        alpha=alpha,
+                        personalization=personalization_view,
+                        max_iter=max_iter,
+                        tol=tol,
+                        nstart=nstart_view,
+                        weight=weight,
+                        dangling=dangling_view)
+
+    return [mgp.Record(node=vertex, rank=rank)
+            for vertex, rank in ranks.items()]