diff --git a/README.md b/README.md index c58a7d2..aa64bbc 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,7 @@ [![pypi](https://img.shields.io/pypi/v/graphblas-algorithms.svg)](https://pypi.python.org/pypi/graphblas-algorithms/) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/python-graphblas/graphblas-algorithms/blob/main/LICENSE) [![Tests](https://github.com/python-graphblas/graphblas-algorithms/workflows/Tests/badge.svg?branch=main)](https://github.com/python-graphblas/graphblas-algorithms/actions) -[![Coverage](https://coveralls.io/repos/python-graphblas/graphblas-algorithms/badge.svg?branch=main)](https://coveralls.io/r/python-graphblas/graphblas-algorithms) -[![Code style](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) + diff --git a/graphblas_algorithms/__init__.py b/graphblas_algorithms/__init__.py index 3a60de1..fce1f5d 100644 --- a/graphblas_algorithms/__init__.py +++ b/graphblas_algorithms/__init__.py @@ -1,4 +1,5 @@ from . import _version +from .cluster import average_clustering, clustering, transitivity, triangles # noqa from .link_analysis import pagerank # noqa __version__ = _version.get_versions()["version"] diff --git a/graphblas_algorithms/_utils.py b/graphblas_algorithms/_utils.py new file mode 100644 index 0000000..a633e7c --- /dev/null +++ b/graphblas_algorithms/_utils.py @@ -0,0 +1,46 @@ +import graphblas as gb +from graphblas import Vector, binary + + +def graph_to_adjacency(G, weight=None, dtype=None, *, name=None): + key_to_id = {k: i for i, k in enumerate(G)} + A = gb.io.from_networkx(G, nodelist=key_to_id, weight=weight, dtype=dtype, name=name) + return A, key_to_id + + +def dict_to_vector(d, key_to_id, *, size=None, dtype=None, name=None): + if d is None: + return None + if size is None: + size = len(key_to_id) + indices, values = zip(*((key_to_id[key], val) for key, val in d.items())) + return Vector.from_values(indices, values, size=size, dtype=dtype, name=name) + + +def list_to_vector(nodes, key_to_id, *, size=None, name=None): + if nodes is None: + return None, None + if size is None: + size = len(key_to_id) + id_to_key = {key_to_id[key]: key for key in nodes} + v = Vector.from_values(list(id_to_key), True, size=size, dtype=bool, name=name) + return v, id_to_key + + +def list_to_mask(nodes, key_to_id, *, size=None, name="mask"): + if nodes is None: + return None, None + v, id_to_key = list_to_vector(nodes, key_to_id, size=size, name=name) + return v.S, id_to_key + + +def vector_to_dict(v, key_to_id, id_to_key=None, *, mask=None, fillvalue=None): + # This mutates the vector to fill it! + if id_to_key is None: + id_to_key = {key_to_id[key]: key for key in key_to_id} + if mask is not None: + if fillvalue is not None and v.nvals < mask.parent.nvals: + v(mask, binary.first) << fillvalue + elif fillvalue is not None and v.nvals < v.size: + v(mask=~v.S) << fillvalue + return {id_to_key[index]: value for index, value in zip(*v.to_values(sort=False))} diff --git a/graphblas_algorithms/cluster.py b/graphblas_algorithms/cluster.py new file mode 100644 index 0000000..90745b3 --- /dev/null +++ b/graphblas_algorithms/cluster.py @@ -0,0 +1,180 @@ +import graphblas as gb +import networkx as nx +from graphblas import Matrix, agg, select +from graphblas.semiring import any_pair, plus_pair +from networkx import average_clustering as _nx_average_clustering +from networkx import clustering as _nx_clustering +from networkx.utils import not_implemented_for + +from ._utils import graph_to_adjacency, list_to_mask, vector_to_dict + + +def get_properties(G, names, *, L=None, U=None, degrees=None, has_self_edges=True): + """Calculate properties of undirected graph""" + if isinstance(names, str): + # Separated by commas and/or spaces + names = [name for name in names.replace(" ", ",").split(",") if name] + rv = [] + for name in names: + if name == "L": + if L is None: + L = select.tril(G, -1).new(name="L") + rv.append(L) + elif name == "U": + if U is None: + U = select.triu(G, 1).new(name="U") + rv.append(U) + elif name == "degrees": + if degrees is None: + degrees = get_degrees(G, L=L, U=U, has_self_edges=has_self_edges) + rv.append(degrees) + elif name == "has_self_edges": + # Compute if cheap + if L is not None: + has_self_edges = G.nvals > 2 * L.nvals + elif U is not None: + has_self_edges = G.nvals > 2 * U.nvals + rv.append(has_self_edges) + else: + raise ValueError(f"Unknown property name: {name}") + if len(rv) == 1: + return rv[0] + return rv + + +def get_degrees(G, mask=None, *, L=None, U=None, has_self_edges=True): + if L is not None: + has_self_edges = G.nvals > 2 * L.nvals + elif U is not None: + has_self_edges = G.nvals > 2 * U.nvals + if has_self_edges: + if L is None or U is None: + L, U = get_properties(G, "L U", L=L, U=U) + degrees = ( + L.reduce_rowwise(agg.count).new(mask=mask) + U.reduce_rowwise(agg.count).new(mask=mask) + ).new(name="degrees") + else: + degrees = G.reduce_rowwise(agg.count).new(mask=mask, name="degrees") + return degrees + + +def single_triangle_core(G, index, *, L=None, has_self_edges=True): + M = Matrix(bool, G.nrows, G.ncols) + M[index, index] = True + C = any_pair(G.T @ M.T).new(name="C") # select.coleq(G.T, index) + has_self_edges = get_properties(G, "has_self_edges", L=L, has_self_edges=has_self_edges) + if has_self_edges: + del C[index, index] # Ignore self-edges + R = C.T.new(name="R") + if has_self_edges: + # Pretty much all the time is spent here taking TRIL, which is used to ignore self-edges + L = get_properties(G, "L", L=L) + return plus_pair(L @ R.T).new(mask=C.S).reduce_scalar(allow_empty=False).value + else: + return plus_pair(G @ R.T).new(mask=C.S).reduce_scalar(allow_empty=False).value // 2 + + +def triangles_core(G, mask=None, *, L=None, U=None): + # Ignores self-edges + L, U = get_properties(G, "L U", L=L, U=U) + C = plus_pair(L @ L.T).new(mask=L.S) + return ( + C.reduce_rowwise().new(mask=mask) + + C.reduce_columnwise().new(mask=mask) + + plus_pair(U @ L.T).new(mask=U.S).reduce_rowwise().new(mask=mask) + ).new(name="triangles") + + +@not_implemented_for("directed") +def triangles(G, nodes=None): + if len(G) == 0: + return {} + A, key_to_id = graph_to_adjacency(G, dtype=bool) + if nodes in G: + return single_triangle_core(A, key_to_id[nodes]) + mask, id_to_key = list_to_mask(nodes, key_to_id) + result = triangles_core(A, mask=mask) + return vector_to_dict(result, key_to_id, id_to_key, mask=mask, fillvalue=0) + + +def total_triangles_core(G, *, L=None, U=None): + # We use SandiaDot method, because it's usually the fastest on large graphs. + # For smaller graphs, Sandia method is usually faster: plus_pair(L @ L).new(mask=L.S) + L, U = get_properties(G, "L U", L=L, U=U) + return plus_pair(L @ U.T).new(mask=L.S).reduce_scalar(allow_empty=False).value + + +def transitivity_core(G, *, L=None, U=None, degrees=None): + L, U = get_properties(G, "L U", L=L, U=U) + numerator = total_triangles_core(G, L=L, U=U) + if numerator == 0: + return 0 + degrees = get_properties(G, "degrees", L=L, U=U, degrees=degrees) + denom = (degrees * (degrees - 1)).reduce().value + return 6 * numerator / denom + + +@not_implemented_for("directed") # Should we implement it for directed? +def transitivity(G): + if len(G) == 0: + return 0 + A = gb.io.from_networkx(G, weight=None, dtype=bool) + return transitivity_core(A) + + +def clustering_core(G, mask=None, *, L=None, U=None, degrees=None): + L, U = get_properties(G, "L U", L=L, U=U) + tri = triangles_core(G, mask=mask, L=L, U=U) + degrees = get_degrees(G, mask=mask, L=L, U=U) + denom = degrees * (degrees - 1) + return (2 * tri / denom).new(name="clustering") + + +def single_clustering_core(G, index, *, L=None, degrees=None, has_self_edges=True): + has_self_edges = get_properties(G, "has_self_edges", L=L, has_self_edges=has_self_edges) + tri = single_triangle_core(G, index, L=L, has_self_edges=has_self_edges) + if tri == 0: + return 0 + if degrees is not None: + degrees = degrees[index].value + else: + row = G[index, :].new() + degrees = row.reduce(agg.count).value + if has_self_edges and row[index].value is not None: + degrees -= 1 + denom = degrees * (degrees - 1) + return 2 * tri / denom + + +def clustering(G, nodes=None, weight=None): + if len(G) == 0: + return {} + if isinstance(G, nx.DiGraph) or weight is not None: + # TODO: Not yet implemented. Clustering implemented only for undirected and unweighted. + return _nx_clustering(G, nodes=nodes, weight=weight) + A, key_to_id = graph_to_adjacency(G, weight=weight) + if nodes in G: + return single_clustering_core(A, key_to_id[nodes]) + mask, id_to_key = list_to_mask(nodes, key_to_id) + result = clustering_core(A, mask=mask) + return vector_to_dict(result, key_to_id, id_to_key, mask=mask, fillvalue=0.0) + + +def average_clustering_core(G, mask=None, count_zeros=True, *, L=None, U=None, degrees=None): + c = clustering_core(G, mask=mask, L=L, U=U, degrees=degrees) + val = c.reduce(allow_empty=False).value + if not count_zeros: + return val / c.nvals + elif mask is not None: + return val / mask.parent.nvals + else: + return val / c.size + + +def average_clustering(G, nodes=None, weight=None, count_zeros=True): + if len(G) == 0 or isinstance(G, nx.DiGraph) or weight is not None: + # TODO: Not yet implemented. Clustering implemented only for undirected and unweighted. + return _nx_average_clustering(G, nodes=nodes, weight=weight, count_zeros=count_zeros) + A, key_to_id = graph_to_adjacency(G, weight=weight) + mask, _ = list_to_mask(nodes, key_to_id) + return average_clustering_core(A, mask=mask, count_zeros=count_zeros) diff --git a/graphblas_algorithms/link_analysis.py b/graphblas_algorithms/link_analysis.py index 3ed0e62..bf8389d 100644 --- a/graphblas_algorithms/link_analysis.py +++ b/graphblas_algorithms/link_analysis.py @@ -1,11 +1,11 @@ -from collections import OrderedDict from warnings import warn -import graphblas as gb import networkx as nx from graphblas import Vector, binary, unary from graphblas.semiring import plus_first, plus_times +from ._utils import dict_to_vector, graph_to_adjacency, vector_to_dict + def pagerank_core( A, @@ -44,7 +44,7 @@ def pagerank_core( # Inverse of row_degrees # Fold alpha constant into S if row_degrees is None: - S = A.reduce_rowwise().new(float, name="S") + S = A.reduce_rowwise().new(float, name="S") # XXX: What about self-edges S << alpha / S else: S = (alpha / row_degrees).new(name="S") @@ -119,26 +119,15 @@ def pagerank( N = len(G) if N == 0: return {} - node_ids = OrderedDict((k, i) for i, k in enumerate(G)) - A = gb.io.from_networkx(G, nodelist=node_ids, weight=weight, dtype=float) - - x = p = dangling_weights = None - # Initial vector (we'll normalize later) - if nstart is not None: - indices, values = zip(*((node_ids[key], val) for key, val in nstart.items())) - x = Vector.from_values(indices, values, size=N, dtype=float, name="nstart") - # Personalization vector (we'll normalize later) - if personalization is not None: - indices, values = zip(*((node_ids[key], val) for key, val in personalization.items())) - p = Vector.from_values(indices, values, size=N, dtype=float, name="personalization") - # Dangling nodes (we'll normalize later) - row_degrees = A.reduce_rowwise().new(name="row_degrees") - if dangling is not None: - if row_degrees.nvals < N: # is_dangling - indices, values = zip(*((node_ids[key], val) for key, val in dangling.items())) - dangling_weights = Vector.from_values( - indices, values, size=N, dtype=float, name="dangling" - ) + A, key_to_id = graph_to_adjacency(G, weight=weight, dtype=float) + # We'll normalize initial, personalization, and dangling vectors later + x = dict_to_vector(nstart, key_to_id, dtype=float, name="nstart") + p = dict_to_vector(personalization, key_to_id, dtype=float, name="personalization") + row_degrees = A.reduce_rowwise().new(name="row_degrees") # XXX: What about self-edges? + if dangling is not None and row_degrees.nvals < N: + dangling_weights = dict_to_vector(dangling, key_to_id, dtype=float, name="dangling") + else: + dangling_weights = None result = pagerank_core( A, alpha=alpha, @@ -149,7 +138,4 @@ def pagerank( dangling=dangling_weights, row_degrees=row_degrees, ) - if result.nvals != N: - # Not likely, but fill with 0 just in case - result(mask=~result.S) << 0 - return dict(zip(node_ids, result.to_values()[1])) + return vector_to_dict(result, key_to_id, fillvalue=0.0) diff --git a/graphblas_algorithms/tests/test_cluster.py b/graphblas_algorithms/tests/test_cluster.py new file mode 100644 index 0000000..af5ec59 --- /dev/null +++ b/graphblas_algorithms/tests/test_cluster.py @@ -0,0 +1,92 @@ +import inspect + +import graphblas as gb +import networkx as nx + +import graphblas_algorithms as ga +from graphblas_algorithms import average_clustering, clustering, transitivity, triangles + +nx_triangles = nx.triangles +nx.triangles = triangles +nx.algorithms.triangles = triangles +nx.algorithms.cluster.triangles = triangles + +nx_transitivity = nx.transitivity +nx.transitivity = transitivity +nx.algorithms.transitivity = transitivity +nx.algorithms.cluster.transitivity = transitivity + +nx_clustering = nx.clustering +nx.clustering = clustering +nx.algorithms.clustering = clustering +nx.algorithms.cluster.clustering = clustering + +nx_average_clustering = nx.average_clustering +nx.average_clustering = average_clustering +nx.algorithms.average_clustering = average_clustering +nx.algorithms.cluster.average_clustering = average_clustering + + +def test_signatures(): + nx_sig = inspect.signature(nx_triangles) + sig = inspect.signature(triangles) + assert nx_sig == sig + nx_sig = inspect.signature(nx_transitivity) + sig = inspect.signature(transitivity) + assert nx_sig == sig + nx_sig = inspect.signature(nx_clustering) + sig = inspect.signature(clustering) + assert nx_sig == sig + + +def test_triangles_full(): + # Including self-edges! + G = gb.Matrix(bool, 5, 5) + G[:, :] = True + G2 = gb.select.offdiag(G).new() + L = gb.select.tril(G, -1).new(name="L") + U = gb.select.triu(G, 1).new(name="U") + result = ga.cluster.triangles_core(G, L=L, U=U) + expected = gb.Vector(int, 5) + expected[:] = 6 + assert result.isequal(expected) + result = ga.cluster.triangles_core(G2, L=L, U=U) + assert result.isequal(expected) + mask = gb.Vector(bool, 5) + mask[0] = True + mask[3] = True + result = ga.cluster.triangles_core(G, mask=mask.S) + expected = gb.Vector(int, 5) + expected[0] = 6 + expected[3] = 6 + assert result.isequal(expected) + result = ga.cluster.triangles_core(G2, mask=mask.S) + assert result.isequal(expected) + assert ga.cluster.single_triangle_core(G, 1) == 6 + assert ga.cluster.single_triangle_core(G, 0, L=L) == 6 + assert ga.cluster.single_triangle_core(G2, 0, has_self_edges=False) == 6 + assert ga.cluster.total_triangles_core(G2) == 10 + assert ga.cluster.total_triangles_core(G) == 10 + assert ga.cluster.total_triangles_core(G, L=L, U=U) == 10 + assert ga.cluster.transitivity_core(G) == 1.0 + assert ga.cluster.transitivity_core(G2) == 1.0 + result = ga.cluster.clustering_core(G) + expected = gb.Vector(float, 5) + expected[:] = 1 + assert result.isequal(expected) + result = ga.cluster.clustering_core(G2) + assert result.isequal(expected) + assert ga.cluster.single_clustering_core(G, 0) == 1 + assert ga.cluster.single_clustering_core(G2, 0) == 1 + expected(mask.S, replace=True) << 1 + result = ga.cluster.clustering_core(G, mask=mask.S) + assert result.isequal(expected) + result = ga.cluster.clustering_core(G2, mask=mask.S) + assert result.isequal(expected) + assert ga.cluster.average_clustering_core(G) == 1 + assert ga.cluster.average_clustering_core(G2) == 1 + assert ga.cluster.average_clustering_core(G, mask=mask.S) == 1 + assert ga.cluster.average_clustering_core(G2, mask=mask.S) == 1 + + +from networkx.algorithms.tests.test_cluster import * # noqa isort:skip diff --git a/requirements.txt b/requirements.txt index ee57e55..d6ac249 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -python-graphblas >=2022.4.1 +python-graphblas >=2022.4.2 +networkx diff --git a/setup.py b/setup.py index ddcfd3f..e83e92d 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,8 @@ } extras_require["complete"] = sorted({v for req in extras_require.values() for v in req}) +with open("requirements.txt") as f: + install_requires = f.read().strip().split("\n") with open("README.md") as f: long_description = f.read() @@ -22,7 +24,7 @@ url="https://github.com/python-graphblas/graphblas-algorithms", packages=find_packages(), python_requires=">=3.8", - install_requires=["python-graphblas >=2022.4.1", "networkx"], + install_requires=install_requires, extras_require=extras_require, include_package_data=True, license="Apache License 2.0",