# -*- coding: utf-8 -*-
#
# Copyright 2018-2020 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Mappers to provide input data for the graph models in layers.
"""
__all__ = [
"FullBatchGenerator",
"FullBatchNodeGenerator",
"FullBatchLinkGenerator",
"RelationalFullBatchNodeGenerator",
]
import warnings
import operator
import random
import numpy as np
import itertools as it
import networkx as nx
import scipy.sparse as sps
from tensorflow.keras import backend as K
from functools import reduce
from tensorflow.keras.utils import Sequence
from . import (
Generator,
FullBatchSequence,
SparseFullBatchSequence,
RelationalFullBatchNodeSequence,
GraphSAGENodeGenerator,
DirectedGraphSAGENodeGenerator,
)
from ..core.graph import StellarGraph
from ..core.utils import is_real_iterable
from ..core.utils import GCN_Aadj_feats_op, PPNP_Aadj_feats_op
from ..core.validation import comma_sep
class FullBatchGenerator(Generator):
    """
    Abstract base class for data generators that feed an entire graph (node
    features plus a preprocessed adjacency matrix) to a full-batch Keras model.

    Do not instantiate this class directly: use :class:`FullBatchNodeGenerator`
    or :class:`FullBatchLinkGenerator`, which set ``multiplicity``.
    """

    # 1 for node-level tasks, 2 for link-level tasks; set by concrete subclasses.
    multiplicity = None

    def __init__(
        self,
        G,
        name=None,
        method="gcn",
        k=1,
        sparse=True,
        transform=None,
        teleport_probability=0.1,
        weighted=False,
    ):
        """
        Args:
            G (StellarGraph): a machine-learning-ready graph with node features
                and a single node type.
            name (str, optional): an optional name for the generator.
            method (str): adjacency-matrix preprocessing; one of ``'gcn'``
                (default), ``'sgc'``, ``'self_loops'``/``'gat'``, ``'ppnp'``,
                or ``'none'``.
            k (int): smoothing order for the ``'sgc'`` method.
            sparse (bool): if True (default), yield a sparse adjacency matrix
                (only supported on the tensorflow backend); otherwise dense.
            transform (callable, optional): custom ``(features, A)`` transform;
                when given it overrides ``method``.
            teleport_probability (float): restart probability for ``'ppnp'``.
            weighted (bool): if True, use the edge weights from ``G``.
        """
        if self.multiplicity is None:
            # FIX: the two fragments were previously concatenated without a
            # space, producing "pleaseinstantiate" in the message.
            raise TypeError(
                "Can't instantiate abstract class 'FullBatchGenerator', please "
                "instantiate either 'FullBatchNodeGenerator' or 'FullBatchLinkGenerator'"
            )

        if not isinstance(G, StellarGraph):
            raise TypeError("Graph must be a StellarGraph or StellarDiGraph object.")

        self.graph = G
        self.name = name
        self.k = k
        self.teleport_probability = teleport_probability
        self.method = method

        # Check if the graph has features
        G.check_graph_for_ml()

        # Check that there is only a single node type for GAT or GCN
        node_type = G.unique_node_type(
            "G: expected a graph with a single node type, found a graph with node types: %(found)s"
        )

        # Create sparse adjacency matrix:
        # Use the node orderings the same as in the graph features
        self.node_list = G.nodes()
        self.Aadj = G.to_adjacency_matrix(weighted=weighted)

        # Power-user feature: make the generator yield dense adjacency matrix instead
        # of the default sparse one.
        # If sparse is specified, check that the backend is tensorflow
        if sparse and K.backend() != "tensorflow":
            warnings.warn(
                "Sparse adjacency matrices are only supported in tensorflow."
                " Falling back to using a dense adjacency matrix."
            )
            self.use_sparse = False
        else:
            self.use_sparse = sparse

        # Get the features for the nodes
        self.features = G.node_features(node_type=node_type)

        if transform is not None:
            if callable(transform):
                self.features, self.Aadj = transform(
                    features=self.features, A=self.Aadj
                )
            else:
                raise ValueError("argument 'transform' must be a callable.")

        elif self.method in ["gcn", "sgc"]:
            self.features, self.Aadj = GCN_Aadj_feats_op(
                features=self.features, A=self.Aadj, k=self.k, method=self.method
            )

        elif self.method in ["gat", "self_loops"]:
            # Set every diagonal entry of A to 1, i.e. add self-loops.
            self.Aadj = self.Aadj + sps.diags(
                np.ones(self.Aadj.shape[0]) - self.Aadj.diagonal()
            )

        elif self.method in ["ppnp"]:
            if self.use_sparse:
                raise ValueError(
                    "sparse: method='ppnp' requires 'sparse=False', found 'sparse=True' "
                    "(consider using the APPNP model for sparse support)"
                )
            self.features, self.Aadj = PPNP_Aadj_feats_op(
                features=self.features,
                A=self.Aadj,
                teleport_probability=self.teleport_probability,
            )

        elif self.method in [None, "none"]:
            pass

        else:
            # FIX: the message previously omitted 'gat', 'ppnp' and 'none',
            # all of which the branches above accept.
            raise ValueError(
                "Undefined method for adjacency matrix transformation. "
                "Accepted: 'gcn' (default), 'sgc', 'self_loops' (or 'gat'), "
                "'ppnp', and 'none'."
            )

    def num_batch_dims(self):
        # Full-batch tensors carry a dummy batch dimension of size 1 plus the
        # node/link dimension.
        return 2

    def flow(self, node_ids, targets=None, use_ilocs=False):
        """
        Creates a generator/sequence object for training or evaluation
        with the supplied node ids and numeric targets.

        Args:
            node_ids: an iterable of node ids for the nodes of interest
                (e.g., training, validation, or test set nodes)
            targets: a 1D or 2D array of numeric node targets with shape
                ``(len(node_ids),)`` or ``(len(node_ids), target_size)``
            use_ilocs (bool): if True, node_ids are represented by ilocs,
                otherwise node_ids need to be transformed into ilocs

        Returns:
            A NodeSequence object to use with GCN or GAT models
            in Keras methods :meth:`fit`, :meth:`evaluate`,
            and :meth:`predict`

        Raises:
            TypeError: if ``targets`` is not iterable or its length does not
                match ``node_ids``.
        """
        if targets is not None:
            # Check targets is an iterable
            if not is_real_iterable(targets):
                raise TypeError("Targets must be an iterable or None")

            # Check targets correct shape
            if len(targets) != len(node_ids):
                raise TypeError("Targets must be the same length as node_ids")

        # find the indices of the nodes, handling both multiplicity 1
        # [node, node, ...] and 2 [(source, target), ...]
        node_ids = np.asarray(node_ids)
        if use_ilocs:
            node_indices = node_ids
        else:
            flat_node_ids = node_ids.reshape(-1)
            flat_node_indices = self.graph.node_ids_to_ilocs(flat_node_ids)
            # back to the original shape
            node_indices = flat_node_indices.reshape(node_ids.shape)

        if self.use_sparse:
            return SparseFullBatchSequence(
                self.features, self.Aadj, targets, node_indices
            )
        else:
            return FullBatchSequence(self.features, self.Aadj, targets, node_indices)
class FullBatchNodeGenerator(FullBatchGenerator):
    """
    Keras data generator for full-batch node-level models on homogeneous
    graphs, e.g. GCN, GAT, SGC, PPNP/APPNP.

    The supplied graph ``G`` must be a :class:`StellarGraph` with node
    features. Call :meth:`flow` with the nodes of interest (and, optionally,
    targets) to obtain an object usable as a Keras data generator: it yields
    the full feature matrix together with a preprocessed adjacency matrix,
    either sparse (the default) or dense, controlled by ``sparse``.

    The ``method`` argument selects the adjacency preprocessing:

    * ``'gcn'``: GCN normalization — the linearized convolution of Eq. 8 in [1].
    * ``'sgc'``: the k-th order smoothed adjacency matrix of Simplified Graph
      Convolutions, Eq. 8 in [2].
    * ``'self_loops'`` / ``'gat'``: sets diagonal entries to one (adds
      self-loops), as used by GAT [3].
    * ``'ppnp'``: the personalized PageRank matrix of Eq. 2 in [4].

    [1] `Kipf and Welling, 2017 <https://arxiv.org/abs/1609.02907>`_.
    [2] `Wu et al. 2019 <https://arxiv.org/abs/1902.07153>`_.
    [3] `Veličković et al., 2018 <https://arxiv.org/abs/1710.10903>`_.
    [4] `Klicpera et al., 2018 <https://arxiv.org/abs/1810.05997>`_.

    Example::

        G_generator = FullBatchNodeGenerator(G)
        train_flow = G_generator.flow(node_ids, node_targets)

        # Fetch the data from train_flow, and feed into a Keras model:
        x_inputs, y_train = train_flow[0]
        model.fit(x=x_inputs, y=y_train)

        # Alternatively, use the generator itself with model.fit:
        model.fit(train_flow, epochs=num_epochs)

    .. seealso::

       Models using this generator: :class:`.GCN`, :class:`.GAT`,
       :class:`.APPNP`, :class:`.PPNP`.

       Related generators:

       - :class:`.ClusterNodeGenerator` for scalable/inductive training
       - :class:`.CorruptedGenerator` for unsupervised training with :class:`.DeepGraphInfomax`
       - :class:`.FullBatchLinkGenerator` for link prediction and similar tasks
       - :class:`.RelationalFullBatchNodeGenerator` for multiple edge types, with :class:`.RGCN`
       - :class:`.PaddedGraphGenerator` for graph classification

    Args:
        G (StellarGraph): a machine-learning StellarGraph-type graph
        name (str): an optional name of the generator
        method (str): method to preprocess the adjacency matrix; one of
            ``gcn`` (default), ``sgc``, ``self_loops``, or ``none``
        k (None or int): smoothing order for the ``sgc`` method (a positive
            integer)
        transform (callable): optional function applied to the features and
            adjacency matrix, taking ``(features, Aadj)`` as arguments
        sparse (bool): if True (default) a sparse adjacency matrix is used,
            if False a dense adjacency matrix is used
        teleport_probability (float): "probability" of returning to the
            starting node in the propagation step, as in [4]; between 0.0 and 1.0
        weighted (bool, optional): if True, use the edge weights from ``G``;
            if False, treat the graph as unweighted
    """

    # Node-level task: each sample is a single node.
    multiplicity = 1

    def flow(self, node_ids, targets=None, use_ilocs=False):
        """
        Build a sequence object for training or evaluation from the given
        node ids and (optional) numeric targets.

        Args:
            node_ids: an iterable of ids of the nodes of interest
                (e.g. training, validation, or test set nodes)
            targets: a 1D or 2D array of numeric node targets with shape
                ``(len(node_ids),)`` or ``(len(node_ids), target_size)``
            use_ilocs (bool): if True, ``node_ids`` are already ilocs;
                otherwise they are converted to ilocs internally

        Returns:
            A NodeSequence object to use with GCN or GAT models in Keras
            methods :meth:`fit`, :meth:`evaluate`, and :meth:`predict`
        """
        # All the work happens in the shared base-class implementation.
        return super().flow(node_ids, targets=targets, use_ilocs=use_ilocs)
class FullBatchLinkGenerator(FullBatchGenerator):
    """
    Keras data generator for full-batch link-level models on homogeneous
    graphs, e.g. GCN, GAT, SGC.

    The supplied graph ``G`` must be a :class:`StellarGraph` with node
    features. Call :meth:`flow` with the links of interest as a list of
    ``(src, dst)`` node-id tuples (and, optionally, targets) to obtain an
    object usable as a Keras data generator: it yields the full feature
    matrix together with a preprocessed adjacency matrix, either sparse
    (the default) or dense, controlled by ``sparse``.

    The ``method`` argument selects the adjacency preprocessing:

    * ``'gcn'``: GCN normalization — the linearized convolution of Eq. 8 in [1].
    * ``'sgc'``: the k-th order smoothed adjacency matrix of Simplified Graph
      Convolutions, Eq. 8 in [2].
    * ``'self_loops'`` / ``'gat'``: sets diagonal entries to one (adds
      self-loops), as used by GAT [3].
    * ``'ppnp'``: the personalized PageRank matrix of Eq. 2 in [4].

    [1] `Kipf and Welling, 2017 <https://arxiv.org/abs/1609.02907>`_.
    [2] `Wu et al. 2019 <https://arxiv.org/abs/1902.07153>`_.
    [3] `Veličković et al., 2018 <https://arxiv.org/abs/1710.10903>`_.
    [4] `Klicpera et al., 2018 <https://arxiv.org/abs/1810.05997>`_.

    Example::

        G_generator = FullBatchLinkGenerator(G)
        train_flow = G_generator.flow([(1,2), (3,4), (5,6)], [0, 1, 1])

        # Fetch the data from train_flow, and feed into a Keras model:
        x_inputs, y_train = train_flow[0]
        model.fit(x=x_inputs, y=y_train)

        # Alternatively, use the generator itself with model.fit:
        model.fit(train_flow, epochs=num_epochs)

    .. seealso::

       Models using this generator: :class:`.GCN`, :class:`.GAT`,
       :class:`.APPNP`, :class:`.PPNP`.

       Related generator: :class:`.FullBatchNodeGenerator` for node
       classification and similar tasks.

    Args:
        G (StellarGraph): a machine-learning StellarGraph-type graph
        name (str): an optional name of the generator
        method (str): method to preprocess the adjacency matrix; one of
            ``gcn`` (default), ``sgc``, ``self_loops``, or ``none``
        k (None or int): smoothing order for the ``sgc`` method (a positive
            integer)
        transform (callable): optional function applied to the features and
            adjacency matrix, taking ``(features, Aadj)`` as arguments
        sparse (bool): if True (default) a sparse adjacency matrix is used,
            if False a dense adjacency matrix is used
        teleport_probability (float): "probability" of returning to the
            starting node in the propagation step, as in [4]; between 0.0 and 1.0
        weighted (bool, optional): if True, use the edge weights from ``G``;
            if False, treat the graph as unweighted
    """

    # Link-level task: each sample is a (source, target) pair of nodes.
    multiplicity = 2

    def flow(self, link_ids, targets=None, use_ilocs=False):
        """
        Build a sequence object for training or evaluation from the given
        links and (optional) numeric targets.

        Args:
            link_ids: an iterable of link ids specified as tuples of node ids,
                or an array of shape (N_links, 2) specifying the links
            targets: a 1D or 2D array of numeric targets with shape
                ``(len(link_ids),)`` or ``(len(link_ids), target_size)``
            use_ilocs (bool): if True, the node ids in ``link_ids`` are
                already ilocs; otherwise they are converted to ilocs internally

        Returns:
            A sequence object to use with GCN or GAT models in Keras
            methods :meth:`fit`, :meth:`evaluate`, and :meth:`predict`
        """
        # All the work happens in the shared base-class implementation.
        return super().flow(link_ids, targets=targets, use_ilocs=use_ilocs)
class RelationalFullBatchNodeGenerator(Generator):
    """
    A data generator for use with full-batch models on relational graphs e.g. RGCN.

    The supplied graph G should be a StellarGraph or StellarDiGraph object with node features.
    Use the :meth:`flow` method supplying the nodes and (optionally) targets
    to get an object that can be used as a Keras data generator.

    This generator will supply the features array and the adjacency matrix to a
    full-batch Keras graph ML model. There is a choice to supply either a list of sparse
    adjacency matrices (the default) or a list of dense adjacency matrices, with the
    ``sparse`` argument.

    For these algorithms the adjacency matrices require preprocessing and the default
    option is to normalize each row of the adjacency matrix so that it sums to 1.
    For customization a transformation (callable) can be passed that operates on the
    node features and adjacency matrix.

    Example::

        G_generator = RelationalFullBatchNodeGenerator(G)
        train_data_gen = G_generator.flow(node_ids, node_targets)

        # Use the generator with model.fit:
        model.fit(train_data_gen, epochs=num_epochs, ...)

    .. seealso::

       Model using this generator: :class:`.RGCN`.

       Related generators:

       - :class:`.FullBatchNodeGenerator` for graphs with one edge type
       - :class:`.CorruptedGenerator` for unsupervised training with :class:`.DeepGraphInfomax`

    Args:
        G (StellarGraph): a machine-learning StellarGraph-type graph
        name (str): an optional name of the generator
        transform (callable): an optional function to apply on features and adjacency matrix;
            the function takes ``(features, Aadj)`` as arguments
        sparse (bool): If True (default) a list of sparse adjacency matrices is used,
            if False a list of dense adjacency matrices is used.
        weighted (bool, optional): if True, use the edge weights from ``G``; if False, treat the
            graph as unweighted.
    """

    def __init__(self, G, name=None, sparse=True, transform=None, weighted=False):
        if not isinstance(G, StellarGraph):
            # FIX: message previously said "StellarGraph object" only, which
            # contradicted this class's docstring and the equivalent check in
            # FullBatchGenerator — directed graphs are accepted too.
            raise TypeError("Graph must be a StellarGraph or StellarDiGraph object.")

        self.graph = G
        self.name = name
        self.use_sparse = sparse
        # Node-level task: each sample is a single node.
        self.multiplicity = 1

        # Check if the graph has features
        G.check_graph_for_ml()

        # extract node, feature, and edge type info from G
        node_types = list(G.node_types)
        if len(node_types) != 1:
            raise ValueError(
                f"G: expected one node type, found {comma_sep(sorted(node_types))}",
            )

        self.features = G.node_features(node_type=node_types[0])

        # create a list of adjacency matrices - one adj matrix for each edge type
        # an adjacency matrix is created for each edge type from all edges of that type
        self.As = []
        for edge_type in G.edge_types:
            # note that A is the transpose of the standard adjacency matrix
            # this is to aggregate features from incoming nodes
            A = G.to_adjacency_matrix(
                edge_type=edge_type, weighted=weighted
            ).transpose()

            if transform is None:
                # normalize each row to sum to 1, replacing zero row sums with 1
                # to avoid harmless divide-by-zero warnings
                d = sps.diags(
                    np.float_power(np.ravel(np.maximum(A.sum(axis=1), 1)), -1), 0
                )
                A = d.dot(A)
            else:
                self.features, A = transform(self.features, A)

            # COO layout is what the downstream sequence consumes
            A = A.tocoo()
            self.As.append(A)

    def num_batch_dims(self):
        # Full-batch tensors carry a dummy batch dimension of size 1 plus the
        # node dimension.
        return 2

    def flow(self, node_ids, targets=None):
        """
        Creates a generator/sequence object for training or evaluation
        with the supplied node ids and numeric targets.

        Args:
            node_ids: an iterable of node ids for the nodes of interest
                (e.g., training, validation, or test set nodes)
            targets: a 2D array of numeric node targets with shape
                ``(len(node_ids), target_size)``

        Returns:
            A NodeSequence object to use with RGCN models
            in Keras methods :meth:`fit`, :meth:`evaluate`,
            and :meth:`predict`

        Raises:
            TypeError: if ``targets`` is not iterable or its length does not
                match ``node_ids``.
        """
        if targets is not None:
            # Check targets is an iterable
            if not is_real_iterable(targets):
                raise TypeError("Targets must be an iterable or None")

            # Check targets correct shape
            if len(targets) != len(node_ids):
                raise TypeError("Targets must be the same length as node_ids")

        node_indices = self.graph.node_ids_to_ilocs(node_ids)

        return RelationalFullBatchNodeSequence(
            self.features, self.As, self.use_sparse, targets, node_indices
        )