Source code for stellargraph.core.graph

# -*- coding: utf-8 -*-
#
# Copyright 2017-2020 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The StellarGraph class that encapsulates information required for
a machine-learning ready graph used by models.

"""
__all__ = ["StellarGraph", "StellarDiGraph", "GraphSchema", "NeighbourWithWeight"]

from typing import Iterable, Any, Mapping, List, Optional, Set
from collections import defaultdict, namedtuple
import pandas as pd
import numpy as np
import scipy.sparse as sps
import warnings

from .. import globalvar
from .schema import GraphSchema, EdgeType
from .experimental import experimental, ExperimentalWarning
from .element_data import NodeData, EdgeData, ExternalIdIndex
from .utils import is_real_iterable
from .validation import comma_sep
from . import convert


NeighbourWithWeight = namedtuple("NeighbourWithWeight", ["node", "weight"])


[docs]class StellarGraph:
    """
    StellarGraph class for graph machine learning.

    Summary of a StellarGraph and the terminology used:

    - it stores graph structure, as a collection of *nodes* and a collection of *edges* that connect
      a *source* node to a *target* node

    - each node and edge has an associated *type*

    - each node has a numeric vector of *features*, and the vectors of all nodes with the same type
      have the same dimension

    - it is *homogeneous* if there is only one type of node and one type of edge

    - it is *heterogeneous* if it is not homgeneous (more than one type of node, or more than
      one type of edge)

    - it is *directed* if the direction of an edge starting at its source node and finishing at
      its target node is important

    - it is *undirected* if the direction does not matter

    - every StellarGraph can be a *multigraph*, meaning there can be multiple edges between any two
      nodes

    To create a StellarGraph object, at a minimum pass the nodes and edges as Pandas
    DataFrames. Each row of the nodes DataFrame represents a node in the graph, where the index is
    the ID of the node. Each row of the edges DataFrame represents an edge, where the index is the
    ID of the edge, and the ``source`` and ``target`` columns store the node ID of the source and
    target nodes.

    For example, suppose we're modelling a graph that's a square with a diagonal::

        a -- b
        | \\  |
        |  \\ |
        d -- c

    The DataFrames might look like::

        nodes = pd.DataFrame([], index=["a", "b", "c", "d"])
        edges = pd.DataFrame(
            {"source": ["a", "b", "c", "d", "a"], "target": ["b", "c", "d", "a", "c"]}
        )

    If this data represents an undirected graph (the ordering of each edge source/target doesn't
    matter)::

        Gs = StellarGraph(nodes, edges)


    If this data represents a directed graph (the ordering does matter)::

        Gs = StellarDiGraph(nodes, edges)


    Numeric node features are taken as any columns of the nodes DataFrame. For example, if the graph
    above has two features ``x`` and ``y`` associated with each node::

        nodes = pd.DataFrame(
            {"x": [-1, 2, -3, 4], "y": [0.4, 0.1, 0.9, 0]}, index=["a", "b", "c", "d"]
        )

    Edge weights are taken as the optional ``weight`` column of the edges DataFrame::

        edges = pd.DataFrame({
            "source": ["a", "b", "c", "d", "a"],
            "target": ["b", "c", "d", "a", "c"],
            "weight": [10, 0.5, 1, 3, 13]
        })

    Heterogeneous graphs, with multiple node or edge types, can be created by passing multiple
    DataFrames in a dictionary. The dictionary keys are the names/identifiers for the type. For
    example, if the graph above has node ``a`` of type ``foo``, and the rest as type ``bar``, the
    construction might look like::

        foo_nodes = pd.DataFrame({"x": [-1]}, index=["a"])
        bar_nodes = pd.DataFrame(
            {"y": [0.4, 0.1, 0.9], "z": [100, 200, 300]}, index=["b", "c", "d"]
        )

        StellarGraph({"foo": foo_nodes, "bar": bar_nodes}, edges)

    Notice the ``foo`` node has one feature ``x``, while the ``bar`` nodes have 2 features ``y`` and
    ``z``. A heterogeneous graph can have different features for each type.

    Edges of different types work in the same way. example instance, if edges have different types based
    on their orientation::

        horizontal_edges = pd.DataFrame(
            {"source": ["a", "c"], "target": ["b", "d"]}, index=[0, 2]
        )
        vertical_edges = pd.DataFrame(
            {"source": ["b", "d"], "target": ["c", "a"]}, index=[1, 3]
        )
        diagonal_edges = pd.DataFrame({"source": ["a"], "target": ["c"]}, index=[4])

        StellarGraph(nodes, {"h": horizontal_edges, "v": vertical_edges, "d": diagonal_edges})

    A dictionary can be passed for both arguments::

        StellarGraph(
            {"foo": foo_nodes, "bar": bar_nodes},
            {"h": horizontal_edges, "v": vertical_edges, "d": diagonal_edges}
        )

    .. note::

        The IDs of nodes must be unique across all types: for example, it is an error to have a node
        0 of type ``a``, and a node 0 of type ``b``. IDs of edges must also be unique across all
        types.

    .. seealso:: :meth:`from_networkx` for construction from a NetworkX graph.

    Args:
        nodes (DataFrame or dict of hashable to Pandas DataFrame, optional):
            Features for every node in the graph. Any columns in the DataFrame are taken as numeric
            node features of type ``dtype``. If there is only one type of node, a DataFrame can be
            passed directly, and the type defaults to the ``node_type_default`` parameter. Nodes
            have an ID taken from the index of the dataframe, and they have to be unique across all
            types.  For nodes with no features, an appropriate DataFrame can be created with
            ``pandas.DataFrame([], index=node_ids)``, where ``node_ids`` is a list of the node
            IDs.

        edges (DataFrame or dict of hashable to Pandas DataFrame, optional):
            An edge list for each type of edges as a Pandas DataFrame containing a source, target
            and (optionally) weight column (the names of each are taken from the ``source_column``,
            ``target_column`` and ``edge_weight_column`` parameters). If there is only one type of
            edges, a DataFrame can be passed directly, and the type defaults to the
            ``edge_type_default`` parameter. Edges have an ID taken from the index of the dataframe,
            and they have to be unique across all types.

        is_directed (bool, optional):
            If True, the data represents a directed multigraph, otherwise an undirected multigraph.

        source_column (str, optional):
            The name of the column to use as the source node of edges in the ``edges`` edge list
            argument.

        target_column (str, optional):
            The name of the column to use as the target node of edges in the ``edges`` edge list
            argument.

        edge_weight_column (str, optional):
            The name of the column in each of the ``edges`` DataFrames to use as the weight of
            edges. If the column does not exist in any of them, it is defaulted to ``1``.

        node_type_default (str, optional):
            The default node type to use, if ``nodes`` is passed as a DataFrame (not a ``dict``).

        edge_type_default (str, optional):
            The default edge type to use, if ``edges`` is passed as a DataFrame (not a ``dict``).

        dtype (numpy data-type, optional):
            The numpy data-type to use for the features extracted from each of the ``nodes`` DataFrames.

        graph:
            Deprecated, use :meth:`from_networkx`.
        node_type_name:
            Deprecated, use :meth:`from_networkx`.
        edge_type_name:
            Deprecated, use :meth:`from_networkx`.
        edge_weight_label:
            Deprecated, use :meth:`from_networkx`.
        node_features:
            Deprecated, use :meth:`from_networkx`.
    """

    def __init__(
        self,
        nodes=None,
        edges=None,
        *,
        is_directed=False,
        source_column=globalvar.SOURCE,
        target_column=globalvar.TARGET,
        edge_weight_column=globalvar.WEIGHT,
        node_type_default=globalvar.NODE_TYPE_DEFAULT,
        edge_type_default=globalvar.EDGE_TYPE_DEFAULT,
        dtype="float32",
        # legacy arguments:
        graph=None,
        node_type_name=globalvar.TYPE_ATTR_NAME,
        edge_type_name=globalvar.TYPE_ATTR_NAME,
        edge_weight_label=None,
        node_features=None,
    ):
        import networkx

        # support for legacy arguments, translate to the new form
        if edge_weight_label is not None:
            # `edge_weight_label` -> `edge_weight_column`
            warnings.warn(
                "the 'edge_weight_label' parameter has been replaced by 'edge_weight_column'",
                DeprecationWarning,
            )
            edge_weight_column = edge_weight_label

        if isinstance(nodes, networkx.Graph):
            # `StellarGraph(nx_graph)` -> `graph`
            graph = nodes
            nodes = None
            if edges is not None:
                raise ValueError(
                    "edges: expected no value when using legacy NetworkX constructor, found: {edges!r}"
                )

        # legacy NetworkX construction
        if graph is not None:
            # FIXME(#717): this should have a deprecation warning, once the tests and examples have
            # stopped using it
            if nodes is not None or edges is not None:
                raise ValueError(
                    "graph: expected no value when using 'nodes' and 'edges' parameters, found: {graph!r}"
                )

            nodes, edges = convert.from_networkx(
                graph,
                node_type_attr=node_type_name,
                edge_type_attr=edge_type_name,
                node_type_default=node_type_default,
                edge_type_default=edge_type_default,
                edge_weight_attr=edge_weight_column,
                node_features=node_features,
                dtype=dtype,
            )

        if nodes is None:
            nodes = {}
        if edges is None:
            edges = {}

        self._is_directed = is_directed
        self._nodes = convert.convert_nodes(
            nodes, name="nodes", default_type=node_type_default, dtype=dtype,
        )
        self._edges = convert.convert_edges(
            edges,
            name="edges",
            default_type=edge_type_default,
            source_column=source_column,
            target_column=target_column,
            weight_column=edge_weight_column,
        )

[docs]    @staticmethod
    def from_networkx(
        graph,
        *,
        edge_weight_attr="weight",
        node_type_attr=globalvar.TYPE_ATTR_NAME,
        edge_type_attr=globalvar.TYPE_ATTR_NAME,
        node_type_default=globalvar.NODE_TYPE_DEFAULT,
        edge_type_default=globalvar.EDGE_TYPE_DEFAULT,
        node_features=None,
        dtype="float32",
        node_type_name=None,
        edge_type_name=None,
        edge_weight_label=None,
    ):
        """
        Construct a ``StellarGraph`` object from a NetworkX graph::

            Gs = StellarGraph.from_networkx(nx_graph)

        To create a StellarGraph object with node features, supply the features
        as a numeric feature vector for each node.

        To take the feature vectors from a node attribute in the original NetworkX
        graph, supply the attribute name to the ``node_features`` argument::

            Gs = StellarGraph.from_networkx(nx_graph, node_features="feature")


        where the nx_graph contains nodes that have a "feature" attribute containing
        the feature vector for the node. All nodes of the same type must have
        the same size feature vectors.

        Alternatively, supply the node features as Pandas DataFrame objects with
        the index of the DataFrame set to the node IDs. For graphs with a single node
        type, you can supply the DataFrame object directly to StellarGraph::

            node_data = pd.DataFrame(
                [feature_vector_1, feature_vector_2, ..],
                index=[node_id_1, node_id_2, ...])
            Gs = StellarGraph.from_networkx(nx_graph, node_features=node_data)

        For graphs with multiple node types, provide the node features as Pandas
        DataFrames for each type separately, as a dictionary by node type.
        This allows node features to have different sizes for each node type::

            node_data = {
                node_type_1: pd.DataFrame(...),
                node_type_2: pd.DataFrame(...),
            }
            Gs = StellarGraph.from_networkx(nx_graph, node_features=node_data)


        You can also supply the node feature vectors as an iterator of `node_id`
        and feature vector pairs, for graphs with single and multiple node types::

            node_data = zip([node_id_1, node_id_2, ...],
                [feature_vector_1, feature_vector_2, ..])
            Gs = StellarGraph.from_networkx(nx_graph, node_features=node_data)


        Args:
            graph: The NetworkX graph instance.
            node_type_attr (str, optional):
                This is the name for the node types that StellarGraph uses
                when processing heterogeneous graphs. StellarGraph will
                look for this attribute in the nodes of the graph to determine
                their type.

            node_type_default (str, optional):
                This is the default node type to use for nodes that do not have
                an explicit type.

            edge_type_attr (str, optional):
                This is the name for the edge types that StellarGraph uses
                when processing heterogeneous graphs. StellarGraph will
                look for this attribute in the edges of the graph to determine
                their type.

            edge_type_default (str, optional):
                This is the default edge type to use for edges that do not have
                an explicit type.

            node_features (str, dict, list or DataFrame optional):
                This tells StellarGraph where to find the node feature information
                required by some graph models. These are expected to be
                a numeric feature vector for each node in the graph.

            edge_weight_attr (str, optional):
                The name of the attribute to use as the weight of edges.

            node_type_name: Deprecated, use ``node_type_attr``.
            edge_type_name: Deprecated, use ``edge_type_attr``.
            edge_weight_label: Deprecated, use ``edge_weight_attr``.

        Returns:
            A ``StellarGraph`` (if ``graph`` is undirected) or ``StellarDiGraph`` (if ``graph`` is
            directed) instance representing the data in ``graph`` and ``node_features``.
        """
        if node_type_name is not None:
            warnings.warn(
                "the 'node_type_name' parameter has been replaced by 'node_type_attr'",
                DeprecationWarning,
            )
            node_type_attr = node_type_name
        if edge_type_name is not None:
            warnings.warn(
                "the 'edge_type_name' parameter has been replaced by 'edge_type_attr'",
                DeprecationWarning,
            )
            edge_type_attr = edge_type_name
        if edge_weight_label is not None:
            warnings.warn(
                "the 'edge_weight_label' parameter has been replaced by 'edge_weight_attr'",
                DeprecationWarning,
            )
            edge_weight_attr = edge_weight_label

        nodes, edges = convert.from_networkx(
            graph,
            node_type_attr=node_type_attr,
            edge_type_attr=edge_type_attr,
            node_type_default=node_type_default,
            edge_type_default=edge_type_default,
            edge_weight_attr=edge_weight_attr,
            node_features=node_features,
            dtype=dtype,
        )

        cls = StellarDiGraph if graph.is_directed() else StellarGraph
        return cls(
            nodes=nodes, edges=edges, edge_weight_column=edge_weight_attr, dtype=dtype
        )

    # customise how a missing attribute is handled to give better error messages for the NetworkX
    # -> no NetworkX transition.
    def __getattr__(self, item):
        import networkx

        try:
            # do the normal access, in case the attribute actually exists, and to get the native
            # python wording of the error
            return super().__getattribute__(item)
        except AttributeError as e:
            if hasattr(networkx.MultiDiGraph, item):
                # a networkx class has this as an attribute, so let's assume that it's old code
                # from before the conversion and replace (the `from None`) the default exception
                # with one with a more specific message that guides the user to the fix
                type_name = type(self).__name__
                raise AttributeError(
                    f"{e.args[0]}. The '{type_name}' type no longer inherits from NetworkX types: use a new StellarGraph method, or, if that is not possible, the `.to_networkx()` conversion function."
                ) from None

            # doesn't look like a NetworkX method so use the default error
            raise

[docs]    def is_directed(self) -> bool:
        """
        Indicates whether the graph is directed (True) or undirected (False).

        Returns:
             bool: The graph directedness status.
        """
        return self._is_directed

[docs]    def number_of_nodes(self) -> int:
        """
        Obtains the number of nodes in the graph.

        Returns:
             int: The number of nodes.
        """
        return len(self._nodes)

[docs]    def number_of_edges(self) -> int:
        """
        Obtains the number of edges in the graph.

        Returns:
             int: The number of edges.
        """
        return len(self._edges)

[docs]    def nodes(self) -> Iterable[Any]:
        """
        Obtains the collection of nodes in the graph.

        Returns:
            The graph nodes.
        """
        return self._nodes.ids.pandas_index

[docs]    def edges(
        self, include_edge_type=False, include_edge_weight=False
    ) -> Iterable[Any]:
        """
        Obtains the collection of edges in the graph.

        Args:
            include_edge_type (bool): A flag that indicates whether to return edge types
            of format (node 1, node 2, edge type) or edge pairs of format (node 1, node 2).
            include_edge_weight (bool): A flag that indicates whether to return edge weights.
            Weights are returned in a separate list.

        Returns:
            The graph edges. If edge weights are included then a tuple of (edges, weights)
        """
        # FIXME: these would be better returned as the 2 or 3 arrays directly, rather than tuple-ing
        # (the same applies to all other instances of zip in this file)
        if include_edge_type:
            edges = list(
                zip(
                    self._edges.sources,
                    self._edges.targets,
                    self._edges.type_of_iloc(slice(None)),
                )
            )
        else:
            edges = list(zip(self._edges.sources, self._edges.targets))

        if include_edge_weight:
            return edges, self._edges.weights

        return edges

[docs]    def has_node(self, node: Any) -> bool:
        """
        Indicates whether or not the graph contains the specified node.

        Args:
            node (any): The node.

        Returns:
             bool: A value of True (cf False) if the node is
             (cf is not) in the graph.
        """
        return node in self._nodes

    def _transform_edges(
        self, other_node_id, ilocs, include_edge_weight, filter_edge_types
    ):
        if include_edge_weight:
            weights = self._edges.weights[ilocs]
        else:
            weights = None

        if filter_edge_types is not None:
            filter_edge_type_ilocs = self._edges.types.to_iloc(filter_edge_types)
            edge_type_ilocs = self._edges.type_ilocs[ilocs]
            correct_type = np.isin(edge_type_ilocs, filter_edge_type_ilocs)

            other_node_id = other_node_id[correct_type]
            if weights is not None:
                weights = weights[correct_type]

        # FIXME(#718): it would be better to return these as ndarrays, instead of (zipped) lists
        if weights is not None:
            return [
                NeighbourWithWeight(node, weight)
                for node, weight in zip(other_node_id, weights)
            ]

        return list(other_node_id)

[docs]    def neighbors(
        self, node: Any, include_edge_weight=False, edge_types=None
    ) -> Iterable[Any]:
        """
        Obtains the collection of neighbouring nodes connected
        to the given node.

        Args:
            node (any): The node in question.
            include_edge_weight (bool, default False): If True, each neighbour in the
                output is a named tuple with fields `node` (the node ID) and `weight` (the edge weight)
            edge_types (list of hashable, optional): If provided, only traverse the graph
                via the provided edge types when collecting neighbours.

        Returns:
            iterable: The neighbouring nodes.
        """
        ilocs = self._edges.edge_ilocs(node, ins=True, outs=True)
        source = self._edges.sources[ilocs]
        target = self._edges.targets[ilocs]
        other_node_id = np.where(source == node, target, source)
        return self._transform_edges(
            other_node_id, ilocs, include_edge_weight, edge_types
        )

[docs]    def in_nodes(
        self, node: Any, include_edge_weight=False, edge_types=None
    ) -> Iterable[Any]:
        """
        Obtains the collection of neighbouring nodes with edges
        directed to the given node. For an undirected graph,
        neighbours are treated as both in-nodes and out-nodes.

        Args:
            node (any): The node in question.
            include_edge_weight (bool, default False): If True, each neighbour in the
                output is a named tuple with fields `node` (the node ID) and `weight` (the edge weight)
            edge_types (list of hashable, optional): If provided, only traverse the graph
                via the provided edge types when collecting neighbours.

        Returns:
            iterable: The neighbouring in-nodes.
        """
        if not self.is_directed():
            # all edges are both incoming and outgoing for undirected graphs
            return self.neighbors(
                node, include_edge_weight=include_edge_weight, edge_types=edge_types
            )

        ilocs = self._edges.edge_ilocs(node, ins=True, outs=False)
        source = self._edges.sources[ilocs]
        return self._transform_edges(source, ilocs, include_edge_weight, edge_types)

[docs]    def out_nodes(
        self, node: Any, include_edge_weight=False, edge_types=None
    ) -> Iterable[Any]:
        """
        Obtains the collection of neighbouring nodes with edges
        directed from the given node. For an undirected graph,
        neighbours are treated as both in-nodes and out-nodes.

        Args:
            node (any): The node in question.
            include_edge_weight (bool, default False): If True, each neighbour in the
                output is a named tuple with fields `node` (the node ID) and `weight` (the edge weight)
            edge_types (list of hashable, optional): If provided, only traverse the graph
                via the provided edge types when collecting neighbours.

        Returns:
            iterable: The neighbouring out-nodes.
        """
        if not self.is_directed():
            # all edges are both incoming and outgoing for undirected graphs
            return self.neighbors(
                node, include_edge_weight=include_edge_weight, edge_types=edge_types
            )

        ilocs = self._edges.edge_ilocs(node, ins=False, outs=True)
        target = self._edges.targets[ilocs]
        return self._transform_edges(target, ilocs, include_edge_weight, edge_types)

[docs]    def nodes_of_type(self, node_type=None):
        """
        Get the nodes of the graph with the specified node types.

        Args:
            node_type (hashable, optional): a type of nodes that exist in the graph

        Returns:
            A list of node IDs with type node_type
        """
        if node_type is None:
            return self.nodes()

        ilocs = self._nodes.type_range(node_type)
        return list(self._nodes.ids.from_iloc(ilocs))

[docs]    def node_type(self, node):
        """
        Get the type of the node

        Args:
            node: Node ID

        Returns:
            Node type
        """
        nodes = [node]
        node_ilocs = self._nodes.ids.to_iloc(nodes, strict=True)
        type_sequence = self._nodes.type_of_iloc(node_ilocs)

        assert len(type_sequence) == 1
        return type_sequence[0]

    @property
    def node_types(self):
        """
        Get a list of all node types in the graph.

        Returns:
            set of types
        """
        return set(self._nodes.types.pandas_index)

[docs]    def node_feature_sizes(self, node_types=None):
        """
        Get the feature sizes for the specified node types.

        Args:
            node_types (list, optional): A list of node types. If None all current node types
                will be used.

        Returns:
            A dictionary of node type and integer feature size.
        """
        all_sizes = self._nodes.feature_sizes()
        if node_types is None:
            return all_sizes

        node_types = set(node_types)
        return {
            type_name: size for type_name, size in all_sizes if type_name in node_types
        }

[docs]    def check_graph_for_ml(self, features=True):
        """
        Checks if all properties required for machine learning training/inference are set up.
        An error will be raised if the graph is not correctly setup.
        """
        if all(size == 0 for _, size in self.node_feature_sizes().items()):
            raise RuntimeError(
                "This StellarGraph has no numeric feature attributes for nodes"
                "Node features are required for machine learning"
            )

        # TODO: check the schema

        # TODO: check the feature node_ids against the graph node ids?

[docs]    def node_features(self, nodes, node_type=None):
        """
        Get the numeric feature vectors for the specified node or nodes.
        If the node type is not specified the node types will be found
        for all nodes. It is therefore important to supply the ``node_type``
        for this method to be fast.

        Args:
            nodes (list or hashable): Node ID or list of node IDs
            node_type (hashable): the type of the nodes.

        Returns:
            Numpy array containing the node features for the requested nodes.
        """
        nodes = np.asarray(nodes)

        node_ilocs = self._nodes.ids.to_iloc(nodes)
        valid = self._nodes.ids.is_valid(node_ilocs)
        all_valid = valid.all()
        valid_ilocs = node_ilocs if all_valid else node_ilocs[valid]

        if node_type is None:
            # infer the type based on the valid nodes
            types = np.unique(self._nodes.type_of_iloc(valid_ilocs))

            if len(types) == 0:
                raise ValueError(
                    "must have at least one node for inference, if `node_type` is not specified"
                )
            if len(types) > 1:
                raise ValueError("all nodes must have the same type")

            node_type = types[0]

        if all_valid:
            return self._nodes.features(node_type, valid_ilocs)

        # If there's some invalid values, they get replaced by zeros; this is designed to allow
        # models that build fixed-size structures (e.g. GraphSAGE) based on neighbours to fill out
        # missing neighbours with zeros automatically, using None as a sentinel.

        # FIXME: None as a sentinel forces nodes to have dtype=object even with integer IDs, could
        # instead use an impossible integer (e.g. 2**64 - 1)

        # everything that's not the sentinel should be valid
        non_nones = nodes != None
        self._nodes.ids.require_valid(nodes[non_nones], node_ilocs[non_nones])

        sampled = self._nodes.features(node_type, valid_ilocs)
        features = np.zeros((len(nodes), sampled.shape[1]))
        features[valid] = sampled

        return features

    ##################################################################
    # Computationally intensive methods:

    def _edge_type_iloc_triples(self, selector=slice(None), stacked=False):
        source_ilocs = self._nodes.ids.to_iloc(self._edges.sources[selector])
        source_type_ilocs = self._nodes.type_ilocs[source_ilocs]

        rel_type_ilocs = self._edges.type_ilocs[selector]

        target_ilocs = self._nodes.ids.to_iloc(self._edges.targets[selector])
        target_type_ilocs = self._nodes.type_ilocs[target_ilocs]

        all_ilocs = source_type_ilocs, rel_type_ilocs, target_type_ilocs
        if stacked:
            return np.stack(all_ilocs, axis=-1)

        return all_ilocs

    def _edge_type_triples(self, selector=slice(None)):
        src_ilocs, rel_ilocs, tgt_ilocs = self._edge_type_iloc_triples(
            selector, stacked=False
        )

        return (
            self._nodes.types.from_iloc(src_ilocs),
            self._edges.types.from_iloc(rel_ilocs),
            self._nodes.types.from_iloc(tgt_ilocs),
        )

    def _unique_type_triples(self, *, return_counts, selector=slice(None)):
        all_type_ilocs = self._edge_type_iloc_triples(selector, stacked=True)

        if len(all_type_ilocs) == 0:
            # FIXME(https://github.com/numpy/numpy/issues/15559): if there's no edges, np.unique is
            # being called on a shape=(0, 3) ndarray, and hits "ValueError: cannot reshape array of
            # size 0 into shape (0,newaxis)", so we manually reproduce what would be returned
            if return_counts:
                ret = None, [], []
            else:
                ret = None, []
        else:
            ret = np.unique(
                all_type_ilocs, axis=0, return_index=True, return_counts=return_counts
            )

        edge_ilocs = ret[1]
        # we've now got the indices for an edge with each triple, along with the counts of them, so
        # we can query to get the actual edge types (this is, at the time of writing, easier than
        # getting the actual type for each type iloc in the triples)
        unique_ets = self._edge_type_triples(edge_ilocs)

        if return_counts:
            return zip(*unique_ets, ret[2])

        return zip(*unique_ets)

[docs]    def info(self, show_attributes=True, sample=None):
        """
        Return an information string summarizing information on the current graph.
        This includes node and edge type information and their attributes.

        Note: This requires processing all nodes and edges and could take a long
        time for a large graph.

        Args:
            show_attributes (bool, default True): If True, include attributes information
            sample (int): To speed up the graph analysis, use only a random sample of
                          this many nodes and edges.

        Returns:
            An information string.
        """
        directed_str = "Directed" if self.is_directed() else "Undirected"
        lines = [
            f"{type(self).__name__}: {directed_str} multigraph",
            f" Nodes: {self.number_of_nodes()}, Edges: {self.number_of_edges()}",
        ]

        # Numpy processing is much faster than NetworkX processing, so we don't bother sampling.
        gs = self.create_graph_schema()

        def str_edge_type(et):
            n1, rel, n2 = et
            return f"{n1}-{rel}->{n2}"

        lines.append("")
        lines.append(" Node types:")

        for nt in gs.node_types:
            nodes = self.nodes_of_type(nt)
            lines.append(f"  {nt}: [{len(nodes)}]")
            edge_types = ", ".join(str_edge_type(et) for et in gs.schema[nt])
            lines.append(f"    Edge types: {edge_types}")

        lines.append("")
        lines.append(" Edge types:")

        # FIXME: it would be better for the schema to just include the counts directly
        for src_ty, rel_ty, tgt_ty, count in self._unique_type_triples(
            return_counts=True
        ):
            et = EdgeType(src_ty, rel_ty, tgt_ty)
            lines.append(f"    {str_edge_type(et)}: [{count}]")

        return "\n".join(lines)

[docs]    def create_graph_schema(self, nodes=None):
        """
        Create graph schema in dict of dict format from current graph.

        Note the assumption we make that there is only one
        edge of a particular edge type per node pair.

        This means that specifying an edge by node0, node1 and edge type
        is unique.

        Arguments:
            nodes (list): A list of node IDs to use to build schema. This must
                represent all node types and all edge types in the graph.
                If not specified, all nodes and edges in the graph are used.

        Returns:
            GraphSchema object.
        """

        graph_schema = {nt: set() for nt in self.node_types}
        edge_types = set()

        if nodes is None:
            selector = slice(None)
        else:
            selector = np.isin(self._edges.sources, nodes) & np.isin(
                self._edges.targets, nodes
            )

        for n1, rel, n2 in self._unique_type_triples(
            selector=selector, return_counts=False
        ):
            edge_type_tri = EdgeType(n1, rel, n2)
            edge_types.add(edge_type_tri)
            graph_schema[n1].add(edge_type_tri)

            if not self.is_directed():
                edge_type_tri = EdgeType(n2, rel, n1)
                edge_types.add(edge_type_tri)
                graph_schema[n2].add(edge_type_tri)

        # Create ordered list of edge_types
        edge_types = sorted(edge_types)

        # Create keys for node and edge types
        schema = {
            node_label: sorted(node_data)
            for node_label, node_data in graph_schema.items()
        }

        return GraphSchema(
            self.is_directed(), sorted(self.node_types), edge_types, schema
        )

[docs]    def node_degrees(self) -> Mapping[Any, int]:
        """
        Obtains a map from node to node degree.

        Returns:
            The degree of each node.
        """
        return self._edges.degrees()

[docs]    def to_adjacency_matrix(self, nodes: Optional[Iterable] = None, weighted=False):
        """
        Obtains a SciPy sparse adjacency matrix of edge weights.

        By default (``weighted=False``), each element of the matrix contains the number
        of edges between the two vertices (only 0 or 1 in a graph without multi-edges).

        Args:
            nodes (iterable): The optional collection of nodes
                comprising the subgraph. If specified, then the
                adjacency matrix is computed for the subgraph;
                otherwise, it is computed for the full graph.
            weighted (bool): If true, use the edge weight column from the graph instead
                of edge counts (weights from multi-edges are summed).

        Returns:
             The weighted adjacency matrix.
        """
        if nodes is None:
            index = self._nodes._id_index
            selector = slice(None)
        else:
            nodes = list(nodes)
            index = ExternalIdIndex(nodes)
            selector = np.isin(self._edges.sources, nodes) & np.isin(
                self._edges.targets, nodes
            )

        # these indices are computed relative to the index above. If `nodes` is None, they'll be the
        # overall ilocs (for the original graph), otherwise they'll be the indices of the `nodes`
        # list.
        src_idx = index.to_iloc(self._edges.sources[selector])
        tgt_idx = index.to_iloc(self._edges.targets[selector])
        if weighted:
            weights = self._edges.weights[selector]
        else:
            weights = np.ones(src_idx.shape, dtype=self._edges.weights.dtype)

        n = len(index)

        adj = sps.csr_matrix((weights, (src_idx, tgt_idx)), shape=(n, n))
        if not self.is_directed():
            # in an undirected graph, the adjacency matrix should be symmetric: which means counting
            # weights from either "incoming" or "outgoing" edges, but not double-counting self loops
            backward = sps.csr_matrix((weights, (tgt_idx, src_idx)), shape=(n, n))
            backward.setdiag(0)
            adj += backward

        # this is a multigraph, let's eliminate any duplicate entries
        adj.sum_duplicates()
        return adj

[docs]    def to_networkx(
        self,
        node_type_name=globalvar.TYPE_ATTR_NAME,
        edge_type_name=globalvar.TYPE_ATTR_NAME,
        edge_weight_label=globalvar.WEIGHT,
        feature_name=globalvar.FEATURE_ATTR_NAME,
    ):
        """
        Create a NetworkX MultiGraph or MultiDiGraph instance representing this graph.

        Args:
            node_type_name (str): the name of the attribute to use to store a node's type (or label).

            edge_type_name (str): the name of the attribute to use to store a edge's type (or label).

            edge_weight_label (str): the name of the attribute to use to store a edge's weight.

            feature_name (str, optional): the name of the attribute to use to store a node's feature
                vector; if ``None``, feature vectors are not stored within each node.

        Returns:
             An instance of `networkx.MultiDiGraph` (if directed) or `networkx.MultiGraph` (if
             undirected) containing all the nodes & edges and their types & features in this graph.
        """
        import networkx

        if self.is_directed():
            graph = networkx.MultiDiGraph()
        else:
            graph = networkx.MultiGraph()

        for ty in self.node_types:
            node_ids = self.nodes_of_type(ty)
            ty_dict = {node_type_name: ty}

            if feature_name is not None:
                features = self.node_features(node_ids, node_type=ty)

                for node_id, node_features in zip(node_ids, features):
                    graph.add_node(
                        node_id, **ty_dict, **{feature_name: node_features},
                    )
            else:
                graph.add_nodes_from(node_ids, **ty_dict)

        iterator = zip(
            self._edges.sources,
            self._edges.targets,
            self._edges.type_of_iloc(slice(None)),
            self._edges.weights,
        )
        graph.add_edges_from(
            (src, dst, {edge_type_name: type_, edge_weight_label: weight})
            for src, dst, type_, weight in iterator
        )

        return graph

    # FIXME: Experimental/special-case methods that need to be considered more; the underscores
    # denote "package private", not fully private, and so are ok to use in the rest of stellargraph
    def _get_index_for_nodes(self, nodes, node_type=None):
        """
        Get the indices for the specified node or nodes.
        If the node type is not specified the node types will be found
        for all nodes. It is therefore important to supply the ``node_type``
        for this method to be fast.

        Args:
            n: (list or hashable) Node ID or list of node IDs
            node_type: (hashable) the type of the nodes.

        Returns:
            Numpy array containing the indices for the requested nodes.
        """
        return self._nodes._id_index.to_iloc(nodes, strict=True)

    def _adjacency_types(self, graph_schema: GraphSchema):
        """
        Obtains the edges in the form of the typed mapping:

            {edge_type_triple: {source_node: [target_node, ...]}}

        Args:
            graph_schema: The graph schema.
        Returns:
             The edge types mapping.
        """
        source_types, rel_types, target_types = self._edge_type_triples(slice(None))

        triples = defaultdict(lambda: defaultdict(lambda: []))

        iterator = zip(
            source_types,
            rel_types,
            target_types,
            self._edges.sources,
            self._edges.targets,
        )
        for src_type, rel_type, tgt_type, src, tgt in iterator:
            triple = EdgeType(src_type, rel_type, tgt_type)
            triples[triple][src].append(tgt)

            if not self.is_directed() and src != tgt:
                other_triple = EdgeType(tgt_type, rel_type, src_type)
                triples[other_triple][tgt].append(src)

        for subdict in triples.values():
            for v in subdict.values():
                # each list should be in order, to ensure sampling methods are deterministic
                v.sort(key=str)

        return triples

    def _edge_weights(self, source_node: Any, target_node: Any) -> List[Any]:
        """
        Obtains the weights of edges between the given pair of nodes.

        Args:
            source_node (any): The source node.
            target_node (any): The target node.

        Returns:
            list: The edge weights.
        """
        # self loops should only be counted once, which means they're effectively always a directed
        # edge at the storage level, unlikely other edges in an undirected graph. This is
        # particularly important with the intersection1d call, where the source_ilocs and
        # target_ilocs will be equal, when source_node == target_node, and thus the intersection
        # will contain all incident edges.
        effectively_directed = self.is_directed() or source_node == target_node
        both_dirs = not effectively_directed

        source_ilocs = self._edges.edge_ilocs(source_node, ins=both_dirs, outs=True)
        target_ilocs = self._edges.edge_ilocs(target_node, ins=True, outs=both_dirs)

        ilocs = np.intersect1d(source_ilocs, target_ilocs, assume_unique=True)

        return [float(x) for x in self._edges.weights[ilocs]]


# A convenience class that merely specifies that edges have direction.
class StellarDiGraph(StellarGraph):
    def __init__(
        self,
        nodes=None,
        edges=None,
        *,
        source_column=globalvar.SOURCE,
        target_column=globalvar.TARGET,
        edge_weight_column=globalvar.WEIGHT,
        node_type_default=globalvar.NODE_TYPE_DEFAULT,
        edge_type_default=globalvar.EDGE_TYPE_DEFAULT,
        dtype="float32",
        # legacy arguments
        graph=None,
        node_type_name=globalvar.TYPE_ATTR_NAME,
        edge_type_name=globalvar.TYPE_ATTR_NAME,
        edge_weight_label=None,
        node_features=None,
    ):
        super().__init__(
            nodes=nodes,
            edges=edges,
            is_directed=True,
            source_column=source_column,
            target_column=target_column,
            edge_weight_column=edge_weight_column,
            node_type_default=node_type_default,
            edge_type_default=edge_type_default,
            dtype=dtype,
            # legacy arguments
            graph=graph,
            node_type_name=node_type_name,
            edge_type_name=edge_type_name,
            edge_weight_label=edge_weight_label,
            node_features=node_features,
        )