Source code for stellargraph.core.graph

# -*- coding: utf-8 -*-
#
# Copyright 2017-2020 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The StellarGraph class that encapsulates information required for
a machine-learning ready graph used by models.

"""
__all__ = ["StellarGraph", "StellarDiGraph", "GraphSchema", "NeighbourWithWeight"]

from typing import Iterable, Any, Mapping, List, Optional, Set
from collections import defaultdict, namedtuple
import pandas as pd
import numpy as np
import scipy.sparse as sps
import warnings

from .. import globalvar
from .schema import GraphSchema, EdgeType
from .experimental import experimental, ExperimentalWarning
from .element_data import NodeData, EdgeData, ExternalIdIndex
from .utils import is_real_iterable
from .validation import comma_sep, separated
from . import convert


NeighbourWithWeight = namedtuple("NeighbourWithWeight", ["node", "weight"])


def extract_element_features(element_data, unique, name, ids, type, use_ilocs):
    if ids is None:
        if type is None:
            type = unique(
                f"{name}_type: in a non-homogeneous graph, expected a {name} type and/or '{name}s' to be passed; found neither '{name}_type' nor '{name}s', and the graph has {name} types: %(found)s"
            )

        return element_data.features_of_type(type)

    ids = np.asarray(ids)

    if len(ids) == 0:
        # empty lists are cast to a default array type of float64 -
        # must manually specify integer type if empty, in which case we can pretend we received ilocs
        ilocs = ids.astype(dtype=np.uint8)
        use_ilocs = True
    elif use_ilocs:
        ilocs = ids
    else:
        ilocs = element_data.ids.to_iloc(ids)

    valid = element_data.ids.is_valid(ilocs)
    all_valid = valid.all()
    valid_ilocs = ilocs if all_valid else ilocs[valid]

    if type is None:
        try:
            # no inference required in a homogeneous-node graph
            type = unique()
        except ValueError:
            # infer the type based on the valid nodes
            types = np.unique(element_data.type_of_iloc(valid_ilocs))

            if len(types) == 0:
                raise ValueError(
                    f"must have at least one node for inference, if `{name}_type` is not specified"
                )
            if len(types) > 1:
                raise ValueError(f"all {name}s must have the same type")

            type = types[0]

    if all_valid:
        return element_data.features(type, valid_ilocs)

    # If there's some invalid values, they get replaced by zeros; this is designed to allow
    # models that build fixed-size structures (e.g. GraphSAGE) based on neighbours to fill out
    # missing neighbours with zeros automatically, using None as a sentinel.

    # FIXME: None as a sentinel forces nodes to have dtype=object even with integer IDs, could
    # instead use an impossible integer (e.g. 2**64 - 1)

    # everything that's not the sentinel should be valid
    if not use_ilocs:
        non_nones = ids != None
        element_data.ids.require_valid(ids[non_nones], ilocs[non_nones])

    sampled = element_data.features(type, valid_ilocs)
    features = np.zeros((len(ids), sampled.shape[1]))
    features[valid] = sampled

    return features


[docs]class StellarGraph: """ StellarGraph class for graph machine learning. Summary of a StellarGraph and the terminology used: - it stores graph structure, as a collection of *nodes* and a collection of *edges* that connect a *source* node to a *target* node - each node and edge has an associated *type* - each node and edge has a numeric vector of *features*, and the vectors of all nodes or edges with the same type have the same dimension - it is *homogeneous* if there is only one type of node and one type of edge - it is *heterogeneous* if it is not homogeneous (more than one type of node, or more than one type of edge) - it is *directed* if the direction of an edge starting at its source node and finishing at its target node is important - it is *undirected* if the direction does not matter - every StellarGraph can be a *multigraph*, meaning there can be multiple edges between any two nodes To create a StellarGraph object, at a minimum pass the edges as a Pandas DataFrame. Each row of the edges DataFrame represents an edge, where the index is the ID of the edge, and the ``source`` and ``target`` columns store the node ID of the source and target nodes. For example, suppose we're modelling a graph that's a square with a diagonal:: a -- b | \\ | | \\ | d -- c The DataFrame might look like:: edges = pd.DataFrame( {"source": ["a", "b", "c", "d", "a"], "target": ["b", "c", "d", "a", "c"]} ) If this data represents an undirected graph (the ordering of each edge source/target doesn't matter):: Gs = StellarGraph(edges=edges) If this data represents a directed graph (the ordering does matter):: Gs = StellarDiGraph(edges=edges) One can also pass information about nodes, as either: - a :class:`.IndexedArray` - a NumPy array, if the node IDs are 0, 1, 2, ... - a Pandas DataFrame Each row of the nodes frame (first dimension of the NumPy array) represents a node in the graph, where the index is the ID of the node. When this node information is not passed (the argument is left as the default), the set of nodes is automatically inferred. This inference in the example above is equivalent to:: nodes = IndexedArray(index=["a", "b", "c", "d"]) Gs = StellarGraph(nodes, edges) Numeric node features are taken as any columns of the nodes DataFrame. For example, if the graph above has two features ``x`` and ``y`` associated with each node:: # As a IndexedArray (no column names): feature_array = np.array([[-1, 0.4], [2, 0.1], [-3, 0.9], [4, 0]]) nodes = IndexedArray(feature_array, index=["a", "b", "c", "d"]) # As a Pandas DataFrame: nodes = pd.DataFrame( {"x": [-1, 2, -3, 4], "y": [0.4, 0.1, 0.9, 0]}, index=["a", "b", "c", "d"] ) # As a NumPy array: # Note, edges must change to using 0, 1, 2, 3 (instead of a, b, c, d) nodes = feature_array Construction directly from a :class:`.IndexedArray` or NumPy array will have the least overhead, but construction from Pandas allows for convenient data transformation. Edge weights are taken as the optional ``weight`` column of the edges DataFrame:: edges = pd.DataFrame({ "source": ["a", "b", "c", "d", "a"], "target": ["b", "c", "d", "a", "c"], "weight": [10, 0.5, 1, 3, 13] }) Numeric edge features are taken by any columns that do not have a special meaning (that is, excluding ``source``, ``target`` and the optional ``weight`` or ``edge_type_column`` columns). For example, if the graph has weighted edges with two features ``a`` and ``b`` associated with each node:: edges = pd.DataFrame({ "source": ["a", "b", "c", "d", "a"], "target": ["b", "c", "d", "a", "c"], "weight": [10, 0.5, 1, 3, 13], "a": [-1, 2, -3, 4, -5], "b": [0.4, 0.1, 0.9, 0, 0.9], }) Heterogeneous graphs, with multiple node or edge types, can be created by passing multiple :class:`.IndexedArray` or DataFrames in a dictionary. The dictionary keys are the names/identifiers for the type. For example, if the graph above has node ``a`` of type ``foo``, and the rest as type ``bar``, the construction might look like:: foo_nodes = IndexedArray(np.array([[-1]]), index=["a"]) bar_nodes = IndexedArray( np.array([[0.4, 100], [0.1, 200], [0.9, 300]], index=["b", "c", "d"]) ) StellarGraph({"foo": foo_nodes, "bar": bar_nodes}, edges) (One cannot pass multiple NumPy arrays, because the node IDs cannot be inferred properly in this case. The node IDs for a NumPy array can be specified via the :class:`.IndexedArray` type.) Notice the ``foo`` node has one feature ``x``, while the ``bar`` nodes have 2 features ``y`` and ``z``. A heterogeneous graph can have different features for each type. Edges of different types can work in the same way. For instance, if edges have different types based on their orientation:: horizontal_edges = pd.DataFrame( {"source": ["a", "c"], "target": ["b", "d"]}, index=[0, 2] ) vertical_edges = pd.DataFrame( {"source": ["b", "d"], "target": ["c", "a"]}, index=[1, 3] ) diagonal_edges = pd.DataFrame({"source": ["a"], "target": ["c"]}, index=[4]) StellarGraph(nodes, {"h": horizontal_edges, "v": vertical_edges, "d": diagonal_edges}) A dictionary can be passed for both arguments:: StellarGraph( {"foo": foo_nodes, "bar": bar_nodes}, {"h": horizontal_edges, "v": vertical_edges, "d": diagonal_edges} ) Alternatively, a single DataFrame can be provided, with an additional column of the type. This column is specified by passing the ``edge_type_column`` argument:: orientation_edges = pd.DataFrame( { "source": ["a", "b", "c", "d", "a"], "target": ["b", "c", "d", "a", "c"], "type": ["h", "v", "h", "v", "d"] } ) StellarGraph(nodes, orientation_edges, edge_type_column="type") .. note:: The IDs of nodes must be unique across all types: for example, it is an error to have a node 0 of type ``a``, and a node 0 of type ``b``. IDs of edges must also be unique across all types. .. _iloc-explanation: This type stores the external IDs for nodes and edges as :term:`ilocs <iloc>`. For convenience, methods here will traffic in the external ID values and transparently convert to and from ilocs as required internally. Many of these methods also have a ``use_ilocs`` parameter that allows for explicitly switching the methods to consume and return ilocs directly, cutting out the conversion overhead. .. seealso:: The :meth:`from_networkx` allows constructing from a NetworkX graph. `The examples of loading data <https://stellargraph.readthedocs.io/en/stable/demos/basics/index.html>`__ into a :class:`.StellarGraph` from many formats. Args: nodes (Numpy array, IndexedArray, DataFrame or dict of hashable to IndexedArray or Pandas DataFrame, optional): Features for every node in the graph. The values are taken as numeric node features of type ``dtype``. If there is only one type of node, a NumPy array, :class:`.IndexedArray` or DataFrame can be passed directly, and the type defaults to the ``node_type_default`` parameter. Nodes have an ID taken from the index of the dataframe, and they have to be unique across all types. For nodes with no features, an appropriate value can be created with ``IndexedArray(index=node_ids)``, where ``node_ids`` is a list of the node IDs. If this is not passed, the nodes will be inferred from ``edges`` with no features for each node. edges (DataFrame or dict of hashable to Pandas DataFrame, optional): An edge list for each type of edges as a Pandas DataFrame containing a source, target and (optionally) weight column (the names of each are taken from the ``source_column``, ``target_column`` and ``edge_weight_column`` parameters), along with any feature columns. If there is only one type of edges, a DataFrame can be passed directly, and the type defaults to the ``edge_type_default`` parameter. Edges have an ID taken from the index of the dataframe, and they have to be unique across all types. is_directed (bool, optional): If True, the data represents a directed multigraph, otherwise an undirected multigraph. source_column (str, optional): The name of the column to use as the source node of edges in the ``edges`` edge list argument. target_column (str, optional): The name of the column to use as the target node of edges in the ``edges`` edge list argument. edge_weight_column (str, optional): The name of the column in each of the ``edges`` DataFrames to use as the weight of edges. If the column does not exist in any of them, it is defaulted to ``1``. edge_type_column (str, optional): The name of the column in the ``edges`` DataFrame to use as the edge type (if this is set, ``edges`` must be a single DataFrame, not a dictionary). node_type_default (str, optional): The default node type to use, if ``nodes`` is passed as a DataFrame (not a ``dict``). edge_type_default (str, optional): The default edge type to use, if ``edges`` is passed as a DataFrame (not a ``dict``). dtype (numpy data-type, optional): The numpy data-type to use for the features extracted from each of the ``nodes`` DataFrames. graph: Deprecated, use :meth:`from_networkx`. node_type_name: Deprecated, use :meth:`from_networkx`. edge_type_name: Deprecated, use :meth:`from_networkx`. node_features: Deprecated, use :meth:`from_networkx`. """ def __init__( self, nodes=None, edges=None, *, is_directed=False, source_column=globalvar.SOURCE, target_column=globalvar.TARGET, edge_weight_column=globalvar.WEIGHT, edge_type_column=None, node_type_default=globalvar.NODE_TYPE_DEFAULT, edge_type_default=globalvar.EDGE_TYPE_DEFAULT, dtype="float32", # legacy arguments: graph=None, node_type_name=globalvar.TYPE_ATTR_NAME, edge_type_name=globalvar.TYPE_ATTR_NAME, node_features=None, ): import networkx if isinstance(nodes, networkx.Graph): # `StellarGraph(nx_graph)` -> `graph` graph = nodes nodes = None if edges is not None: raise ValueError( "edges: expected no value when using legacy NetworkX constructor, found: {edges!r}" ) # legacy NetworkX construction if graph is not None: # FIXME(#717): this should have a deprecation warning, once the tests and examples have # stopped using it if nodes is not None or edges is not None: raise ValueError( "graph: expected no value when using 'nodes' and 'edges' parameters, found: {graph!r}" ) warnings.warn( "Constructing a StellarGraph directly from a NetworkX graph has been replaced by the `StellarGraph.from_networkx` function", DeprecationWarning, stacklevel=2, ) nodes, edges = convert.from_networkx( graph, node_type_attr=node_type_name, edge_type_attr=edge_type_name, node_type_default=node_type_default, edge_type_default=edge_type_default, edge_weight_attr=edge_weight_column, node_features=node_features, dtype=dtype, ) if nodes is None: nodes_after_inference = self._infer_nodes_from_edges( edges, source_column, target_column ) nodes = pd.DataFrame([], index=nodes_after_inference) if edges is None: edges = {} self._is_directed = is_directed nodes_is_internal = isinstance(nodes, NodeData) edges_is_internal = isinstance(edges, EdgeData) any_internal = nodes_is_internal or edges_is_internal if not any_internal: internal_nodes = convert.convert_nodes( nodes, name="nodes", default_type=node_type_default, dtype=dtype, ) internal_edges = convert.convert_edges( edges, name="edges", default_type=edge_type_default, source_column=source_column, target_column=target_column, weight_column=edge_weight_column, type_column=edge_type_column, nodes=internal_nodes, dtype=dtype, ) else: if not edges_is_internal: raise TypeError( f"edges: expected type 'EdgeData' when 'nodes' has type 'NodeData', found {type(edges).__name__}" ) if not nodes_is_internal: raise TypeError( f"nodes: expected type 'NodeData' when 'edges' has type 'EdgeData', found {type(nodes).__name__}" ) params = locals() for param, expected in self.__init__.__kwdefaults__.items(): if param == "is_directed": continue if params[param] is not expected: raise ValueError( f"{param}: expected the default value ({expected!r}) when constructing from 'NodeData' and 'EdgeData', found {params[param]!r}. (All parameters except 'nodes', 'edges' and 'is_directed' must be left unset.)" ) internal_nodes = nodes internal_edges = edges # FIXME: it would be good to do more validation that 'nodes' and 'edges' match here self._nodes = internal_nodes self._edges = internal_edges @staticmethod def _infer_nodes_from_edges(edges, source_column, target_column): # `convert_edges` nicely flags any errors in edges; inference here is lax rather than duplicate that if isinstance(edges, dict): dataframes = edges.values() else: dataframes = [edges] found_columns = [ type_edges[column] for type_edges in dataframes if isinstance(type_edges, pd.DataFrame) for column in [source_column, target_column] if column in type_edges.columns ] if found_columns: return pd.unique(np.concatenate(found_columns)) return []
[docs] @staticmethod def from_networkx( graph, *, edge_weight_attr="weight", node_type_attr=globalvar.TYPE_ATTR_NAME, edge_type_attr=globalvar.TYPE_ATTR_NAME, node_type_default=globalvar.NODE_TYPE_DEFAULT, edge_type_default=globalvar.EDGE_TYPE_DEFAULT, node_features=None, dtype="float32", ): """ Construct a ``StellarGraph`` object from a NetworkX graph:: Gs = StellarGraph.from_networkx(nx_graph) To create a StellarGraph object with node features, supply the features as a numeric feature vector for each node. To take the feature vectors from a node attribute in the original NetworkX graph, supply the attribute name to the ``node_features`` argument:: Gs = StellarGraph.from_networkx(nx_graph, node_features="feature") where ``nx_graph`` contains nodes that have a ``"feature"`` attribute containing the feature vector for the node. All nodes of the same type must have the same size feature vectors. Alternatively, supply the node features as Pandas DataFrame objects with the index of the DataFrame set to the node IDs. For graphs with a single node type, you can supply the DataFrame object directly to StellarGraph:: node_data = pd.DataFrame( [feature_vector_1, feature_vector_2, ..], index=[node_id_1, node_id_2, ...]) Gs = StellarGraph.from_networkx(nx_graph, node_features=node_data) For graphs with multiple node types, provide the node features as Pandas DataFrames for each type separately, as a dictionary by node type. This allows node features to have different sizes for each node type:: node_data = { node_type_1: pd.DataFrame(...), node_type_2: pd.DataFrame(...), } Gs = StellarGraph.from_networkx(nx_graph, node_features=node_data) The dictionary only needs to include node types with features. If a node type isn't mentioned in the dictionary (for example, if `nx_graph` above has a 3rd node type), each node of that type will have a feature vector of length zero. You can also supply the node feature vectors as an iterator of `node_id` and feature vector pairs, for graphs with single and multiple node types:: node_data = zip([node_id_1, node_id_2, ...], [feature_vector_1, feature_vector_2, ..]) Gs = StellarGraph.from_networkx(nx_graph, node_features=node_data) .. seealso:: `The Loading from NetworkX example <https://stellargraph.readthedocs.io/en/stable/demos/basics/loading-networkx.html>`__ Args: graph: The NetworkX graph instance. node_type_attr (str, optional): This is the name for the node types that StellarGraph uses when processing heterogeneous graphs. StellarGraph will look for this attribute in the nodes of the graph to determine their type. node_type_default (str, optional): This is the default node type to use for nodes that do not have an explicit type. edge_type_attr (str, optional): This is the name for the edge types that StellarGraph uses when processing heterogeneous graphs. StellarGraph will look for this attribute in the edges of the graph to determine their type. edge_type_default (str, optional): This is the default edge type to use for edges that do not have an explicit type. node_features (str, dict, list or DataFrame optional): This tells StellarGraph where to find the node feature information required by some graph models. These are expected to be a numeric feature vector for each node in the graph. edge_weight_attr (str, optional): The name of the attribute to use as the weight of edges. Returns: A ``StellarGraph`` (if ``graph`` is undirected) or ``StellarDiGraph`` (if ``graph`` is directed) instance representing the data in ``graph`` and ``node_features``. """ nodes, edges = convert.from_networkx( graph, node_type_attr=node_type_attr, edge_type_attr=edge_type_attr, node_type_default=node_type_default, edge_type_default=edge_type_default, edge_weight_attr=edge_weight_attr, node_features=node_features, dtype=dtype, ) cls = StellarDiGraph if graph.is_directed() else StellarGraph return cls( nodes=nodes, edges=edges, edge_weight_column=edge_weight_attr, dtype=dtype )
# customise how a missing attribute is handled to give better error messages for the NetworkX # -> no NetworkX transition. def __getattr__(self, item): import networkx try: # do the normal access, in case the attribute actually exists, and to get the native # python wording of the error return super().__getattribute__(item) except AttributeError as e: if hasattr(networkx.MultiDiGraph, item): # a networkx class has this as an attribute, so let's assume that it's old code # from before the conversion and replace (the `from None`) the default exception # with one with a more specific message that guides the user to the fix type_name = type(self).__name__ raise AttributeError( f"{e.args[0]}. The '{type_name}' type no longer inherits from NetworkX types: use a new StellarGraph method, or, if that is not possible, the `.to_networkx()` conversion function." ) from None # doesn't look like a NetworkX method so use the default error raise
[docs] def is_directed(self) -> bool: """ Indicates whether the graph is directed (True) or undirected (False). Returns: bool: The graph directedness status. """ return self._is_directed
[docs] def number_of_nodes(self) -> int: """ Obtains the number of nodes in the graph. Returns: int: The number of nodes. """ return len(self._nodes)
[docs] def number_of_edges(self) -> int: """ Obtains the number of edges in the graph. Returns: int: The number of edges. """ return len(self._edges)
[docs] def nodes(self, node_type=None, use_ilocs=False) -> Iterable[Any]: """ Obtains the collection of nodes in the graph. Args: node_type (hashable, optional): a type of nodes that exist in the graph use_ilocs (bool): if True return :ref:`node ilocs <iloc-explanation>` as a ``range`` object Returns: All the nodes in the graph if ``node_type`` is ``None``, otherwise all the nodes in the graph of type ``node_type``. """ if node_type is None: all_ids = self._nodes.ids.pandas_index if use_ilocs: return range(self.number_of_nodes()) return all_ids ilocs = self._nodes.type_range(node_type) if use_ilocs: return ilocs return self._nodes.ids.from_iloc(ilocs)
def _to_edges(self, edge_arrs): edges = list(zip(*(arr for arr in edge_arrs[:3] if arr is not None))) if edge_arrs[3] is not None: return edges, edge_arrs[3] return edges
[docs] def edges( self, include_edge_type=False, include_edge_weight=False, use_ilocs=False ) -> Iterable[Any]: """ Obtains the collection of edges in the graph. Args: include_edge_type (bool): A flag that indicates whether to return edge types of format (node 1, node 2, edge type) or edge pairs of format (node 1, node 2). include_edge_weight (bool): A flag that indicates whether to return edge weights. Weights are returned in a separate list. use_ilocs (bool): if True return :ref:`ilocs for nodes (and edge types) <iloc-explanation>` Returns: The graph edges. If edge weights are included then a tuple of (edges, weights). """ edge_arrs = self.edge_arrays( include_edge_type, include_edge_weight, use_ilocs=use_ilocs ) return self._to_edges(edge_arrs)
[docs] def edge_arrays( self, include_edge_type=False, include_edge_weight=False, use_ilocs=False ) -> tuple: """ Obtains the collection of edges in the graph as a tuple of arrays (sources, targets, types, weights). ``types`` and ``weights`` will be `None` if the optional parameters are not specified. Args: include_edge_type (bool): A flag that indicates whether to return edge types. include_edge_weight (bool): A flag that indicates whether to return edge weights. use_ilocs (bool): if True return :ref:`ilocs for nodes (and edge types) <iloc-explanation>` Returns: A tuple containing 1D arrays of the source and target nodes (sources, targets, types, weights). Setting include_edge_type and/or include_edge_weight to True will include arrays of edge types and/or edge weights in this tuple, otherwise they will be set to ``None``. """ types = types = self._edges.type_ilocs if include_edge_type else None weights = self._edges.weights if include_edge_weight else None sources = self._edges.sources targets = self._edges.targets if not use_ilocs: sources = self.node_ilocs_to_ids(sources) targets = self.node_ilocs_to_ids(targets) types = self._edges.type_of_iloc(slice(None)) if include_edge_type else None return sources, targets, types, weights
[docs] def has_node(self, node: Any) -> bool: """ Indicates whether or not the graph contains the specified node. Args: node (any): The node. Returns: bool: A value of True (cf False) if the node is (cf is not) in the graph. """ return node in self._nodes
def _transform_edges( self, other_node, ilocs, include_edge_weight, filter_edge_types, use_ilocs ): if include_edge_weight: weights = self._edges.weights[ilocs] else: weights = None if not use_ilocs: other_node = self._nodes.ids.from_iloc(other_node) if filter_edge_types is not None: if not use_ilocs: filter_edge_types = self._edges.types.to_iloc(filter_edge_types) edge_type_ilocs = self._edges.type_ilocs[ilocs] correct_type = np.isin(edge_type_ilocs, filter_edge_types) other_node = other_node[correct_type] if weights is not None: weights = weights[correct_type] if weights is not None: return other_node, weights return other_node def _to_neighbors(self, neigh_arrs, include_edge_weight): if include_edge_weight: return [ NeighbourWithWeight(neigh, weight) for neigh, weight in zip(*neigh_arrs) ] return list(neigh_arrs)
[docs] def neighbor_arrays( self, node: Any, include_edge_weight=False, edge_types=None, use_ilocs=False ): """ Obtains the collection of neighbouring nodes connected to the given node as an array of node_ids. If `include_edge_weight` edge is `True` then an array of edges weights is also returned in a tuple of `(neighbor_ids, edge_weights)`. Args: node (any): The node in question. include_edge_weight (bool, default False): If True an array of edge weights is also returned. edge_types (list of hashable, optional): If provided, only traverse the graph via the provided edge types when collecting neighbours. use_ilocs (bool): if True `node` is treated as a :ref:`node iloc <iloc-explanation>` (and similarly `edge_types` is treated as a edge type ilocs) and the ilocs of each neighbour is returned. Returns: A numpy array of the neighboring nodes. If `include_edge_weight` is `True` then an array of edge weights is also returned in a tuple `(neighbor_array, edge_weight_array)` """ if not use_ilocs: node = self._nodes.ids.to_iloc([node])[0] edge_ilocs = self._edges.edge_ilocs(node, ins=True, outs=True) source = self._edges.sources[edge_ilocs] target = self._edges.targets[edge_ilocs] other_node = np.where(source == node, target, source) return self._transform_edges( other_node, edge_ilocs, include_edge_weight, edge_types, use_ilocs )
[docs] def neighbors( self, node: Any, include_edge_weight=False, edge_types=None, use_ilocs=False ) -> Iterable[any]: """ Obtains the collection of neighbouring nodes connected to the given node. Args: node (any): The node in question. include_edge_weight (bool, default False): If True, each neighbour in the output is a named tuple with fields `node` (the node ID) and `weight` (the edge weight) edge_types (list of hashable, optional): If provided, only traverse the graph via the provided edge types when collecting neighbours. use_ilocs (bool): if True `node` is treated as a :ref:`node iloc <iloc-explanation>` (and similarly `edge_types` is treated as a edge type ilocs) and the ilocs of each neighbour is returned. Returns: iterable: The neighboring nodes. """ neigh_arrs = self.neighbor_arrays( node, include_edge_weight, edge_types, use_ilocs=use_ilocs ) return self._to_neighbors(neigh_arrs, include_edge_weight)
[docs] def in_node_arrays( self, node: Any, include_edge_weight=False, edge_types=None, use_ilocs=False ): """ Obtains the collection of neighbouring nodes with edges directed to the given node. For an undirected graph, neighbours are treated as both in-nodes and out-nodes. Args: node (any): The node in question. include_edge_weight (bool, default False): If True an array of edge weights is also returned. edge_types (list of hashable, optional): If provided, only traverse the graph via the provided edge types when collecting neighbours. use_ilocs (bool): if True `node` is treated as a :ref:`node iloc <iloc-explanation>` (and similarly `edge_types` is treated as a edge type ilocs) and the ilocs of each neighbour is returned. Returns: A numpy array of the neighboring in-nodes. If `include_edge_weight` is `True` then an array of edge weights is also returned in a tuple `(neighbor_array, edge_weight_array)` """ if not self.is_directed(): # all edges are both incoming and outgoing for undirected graphs return self.neighbor_arrays( node, include_edge_weight=include_edge_weight, edge_types=edge_types, use_ilocs=use_ilocs, ) if not use_ilocs: node = self._nodes.ids.to_iloc([node])[0] edge_ilocs = self._edges.edge_ilocs(node, ins=True, outs=False) source = self._edges.sources[edge_ilocs] return self._transform_edges( source, edge_ilocs, include_edge_weight, edge_types, use_ilocs )
[docs] def in_nodes( self, node: Any, include_edge_weight=False, edge_types=None, use_ilocs=False ) -> Iterable[Any]: """ Obtains the collection of neighbouring nodes with edges directed to the given node. For an undirected graph, neighbours are treated as both in-nodes and out-nodes. Args: node (any): The node in question. include_edge_weight (bool, default False): If True, each neighbour in the output is a named tuple with fields `node` (the node ID) and `weight` (the edge weight) edge_types (list of hashable, optional): If provided, only traverse the graph via the provided edge types when collecting neighbours. use_ilocs (bool): if True `node` is treated as a :ref:`node iloc <iloc-explanation>` (and similarly `edge_types` is treated as a edge type ilocs) and the ilocs of each neighbour is returned. Returns: iterable: The neighbouring in-nodes. """ neigh_arrs = self.in_node_arrays( node, include_edge_weight, edge_types, use_ilocs=use_ilocs ) return self._to_neighbors(neigh_arrs, include_edge_weight)
[docs] def out_node_arrays( self, node: Any, include_edge_weight=False, edge_types=None, use_ilocs=False ): """ Obtains the collection of neighbouring nodes with edges directed from the given node. For an undirected graph, neighbours are treated as both in-nodes and out-nodes. Args: node (any): The node in question. include_edge_weight (bool, default False): If True an array of edge weights is also returned. edge_types (list of hashable, optional): If provided, only traverse the graph via the provided edge types when collecting neighbours. use_ilocs (bool): if True `node` is treated as a :ref:`node iloc <iloc-explanation>` (and similarly `edge_types` is treated as a edge type ilocs) and the ilocs of each neighbour is returned. Returns: A numpy array of the neighboring out-nodes. If `include_edge_weight` is `True` then an array of edge weights is also returned in a tuple `(neighbor_array, edge_weight_array)` """ if not self.is_directed(): # all edges are both incoming and outgoing for undirected graphs return self.neighbor_arrays( node, include_edge_weight=include_edge_weight, edge_types=edge_types, use_ilocs=use_ilocs, ) if not use_ilocs: node = self._nodes.ids.to_iloc([node])[0] edge_ilocs = self._edges.edge_ilocs(node, ins=False, outs=True) target = self._edges.targets[edge_ilocs] return self._transform_edges( target, edge_ilocs, include_edge_weight, edge_types, use_ilocs )
[docs] def out_nodes( self, node: Any, include_edge_weight=False, edge_types=None, use_ilocs=False ) -> Iterable[Any]: """ Obtains the collection of neighbouring nodes with edges directed from the given node. For an undirected graph, neighbours are treated as both in-nodes and out-nodes. Args: node (any): The node in question. include_edge_weight (bool, default False): If True, each neighbour in the output is a named tuple with fields `node` (the node ID) and `weight` (the edge weight) edge_types (list of hashable, optional): If provided, only traverse the graph via the provided edge types when collecting neighbours. use_ilocs (bool): if True `node` is treated as a :ref:`node iloc <iloc-explanation>` (and similarly `edge_types` is treated as a edge type ilocs) and the ilocs of each neighbour is returned. Returns: iterable: The neighbouring out-nodes. """ neigh_arrs = self.out_node_arrays( node, include_edge_weight, edge_types, use_ilocs=use_ilocs ) return self._to_neighbors(neigh_arrs, include_edge_weight)
[docs] def nodes_of_type(self, node_type=None): """ Get the nodes of the graph with the specified node types. Args: node_type (hashable): a type of nodes that exist in the graph (this must be passed, omitting it or passing ``None`` is deprecated) Returns: A list of node IDs with type node_type """ warnings.warn( "'nodes_of_type' is deprecated and will be removed; use the 'nodes(type=...)' method instead", DeprecationWarning, stacklevel=2, ) return list(self.nodes(node_type=node_type))
[docs] def node_type(self, node, use_ilocs=False): """ Get the type of the node Args: node: a node or iterable of nodes use_ilocs: if True `node` is treated as a :ref:`node iloc <iloc-explanation>` Returns: Node type or numpy array of node types """ if is_real_iterable(node): nodes = node else: nodes = [node] if not use_ilocs: nodes = self._nodes.ids.to_iloc(nodes, strict=True) type_sequence = self._nodes.type_of_iloc(nodes) if is_real_iterable(node): return type_sequence assert len(type_sequence) == 1 return type_sequence[0]
@property def node_types(self): """ Get a list of all node types in the graph. Returns: set of types """ return set(self._nodes.types.pandas_index) def _unique_type(self, element_data, name, error_message): all_types = element_data.types.pandas_index if len(all_types) == 1: return all_types[0] found = comma_sep(all_types) if error_message is None: error_message = "Expected only one %(name)s type for 'unique_%(name)s_type', found: %(found)s" raise ValueError(error_message % {"name": name, "found": found})
[docs] def unique_node_type(self, error_message=None): """ Return the unique node type, for a homogeneous-node graph. Args: error_message (str, optional): a custom message to use for the exception; this can use the ``%(found)s`` placeholder to insert the real sequence of node types. Returns: If this graph has only one node type, this returns that node type, otherwise it raises a ``ValueError`` exception. """ return self._unique_type(self._nodes, "node", error_message)
@property def edge_types(self): """ Returns: a sequence of all edge types in the graph """ return self._edges.types.pandas_index
[docs] def unique_edge_type(self, error_message=None): """ Return the unique edge type, for a homogeneous-edge graph. Args: error_message (str, optional): a custom message to use for the exception; this can use the ``%(found)s`` placeholder to insert the real sequence of edge types. Returns: If this graph has only one edge type, this returns that edge type, otherwise it raises a ``ValueError`` exception. """ return self._unique_type(self._edges, "edge", error_message)
[docs] def node_type_names_to_ilocs(self, node_type_names): """ Get the :ref:`node type ilocs <iloc-explanation>` for the specified node types. Args: node_type_names (sequence of hashable): node types Returns: Numpy array containing the ilocs of the requested node types. """ return self._nodes.types.to_iloc(node_type_names, strict=True)
[docs] def node_type_ilocs_to_names(self, node_type_ilocs): """ Get the names of the specified :ref:`node type ilocs <iloc-explanation>`. Args: node_type_ilocs (sequence of int): node type ilocs Returns: Numpy array containing the names of the requested node types. """ return self._nodes.types.from_iloc(node_type_ilocs)
[docs] def edge_type_names_to_ilocs(self, edge_type_names): """ Get the :ref:`edge type ilocs <iloc-explanation>` for the specified edge types. Args: edge_type_names (sequence of hashable): edge types Returns: Numpy array containing the ilocs of the requested edge types. """ return self._edges.types.to_iloc(edge_type_names, strict=True)
[docs] def edge_type_ilocs_to_names(self, edge_type_ilocs): """ Get the names of the specified :ref:`edge type ilocs <iloc-explanation>`. Args: edge_type_ilocs (sequence of int): edge type ilocs Returns: Numpy array containing the names of the requested edge types. """ return self._edges.types.from_iloc(edge_type_ilocs)
def _feature_shapes(self, element_data, types): all_sizes = element_data.feature_info() if types is None: types = all_sizes.keys() return {type_name: all_sizes[type_name][0] for type_name in types} def _feature_sizes(self, element_data, types, name): def get(type_name, shape): if len(shape) != 1: raise ValueError( f"{name}_feature_sizes expects {name} types that have feature vectors (rank 1), found type {type_name!r} with feature shape {shape}" ) return shape[0] return { type_name: get(type_name, shape) for type_name, shape in self._feature_shapes(element_data, types).items() }
[docs] def node_feature_sizes(self, node_types=None): """ Get the feature sizes for the specified node types. .. seealso:: :meth:`node_feature_shapes` Args: node_types (list, optional): A list of node types. If None all current node types will be used. Returns: A dictionary of node type and integer feature size. """ return self._feature_sizes(self._nodes, node_types, "node")
[docs] def node_feature_shapes(self, node_types=None): """ Get the feature shapes for the specified node types. .. seealso:: :meth:`node_feature_sizes` Args: node_types (list, optional): A list of node types. If None all current node types will be used. Returns: A dictionary of node type and tuple feature shapes. """ return self._feature_shapes(self._nodes, node_types)
[docs] def edge_feature_sizes(self, edge_types=None): """ Get the feature sizes for the specified edge types. .. seealso:: :meth:`edge_feature_shapes` Args: edge_types (list, optional): A list of edge types. If None all current edge types will be used. Returns: A dictionary of edge type and integer feature size. """ return self._feature_sizes(self._edges, edge_types, "edge")
[docs] def edge_feature_shapes(self, edge_types=None): """ Get the feature shapes for the specified edge types. .. seealso:: :meth:`edge_feature_sizes` Args: edge_types (list, optional): A list of edge types. If None all current edge types will be used. Returns: A dictionary of edge type and tuple feature shapes. """ return self._feature_shapes(self._edges, edge_types)
[docs] def check_graph_for_ml(self, features=True, expensive_check=False): """ Checks if all properties required for machine learning training/inference are set up. An error will be raised if the graph is not correctly setup. """ if all(size == 0 for _, size in self.node_feature_sizes().items()): raise RuntimeError( "This StellarGraph has no numeric feature attributes for nodes" "Node features are required for machine learning" )
# TODO: check the schema # TODO: check the feature node_ids against the graph node ids?
[docs] def node_ids_to_ilocs(self, nodes): """ Get the :ref:`node ilocs <iloc-explanation>` for the specified node or nodes. Args: nodes (list or hashable): node IDs Returns: Numpy array containing the indices for the requested nodes. """ return self._nodes.ids.to_iloc(nodes, strict=True)
[docs] def node_ilocs_to_ids(self, node_ilocs): """ Get the node ids for the specified :ref:`node ilocs <iloc-explanation>`. Args: node_ilocs (list or hashable): :ref:`node ilocs <iloc-explanation>` Returns: Numpy array containing the node ids for the requested nodes. """ return self._nodes.ids.from_iloc(node_ilocs)
[docs] def node_features(self, nodes=None, node_type=None, use_ilocs=False): """ Get the numeric feature vectors for the specified nodes or node type. For graphs with a single node type: - ``graph.node_features()`` to retrieve features of all nodes, in the same order as ``graph.nodes()``. - ``graph.node_features(nodes=some_node_ids)`` to retrieve features for each node in ``some_node_ids``. For graphs with multiple node types: - ``graph.node_features(node_type=some_type)`` to retrieve features of all nodes of type ``some_type``, in the same order as ``graph.nodes(node_type=some_type)``. - ``graph.node_features(nodes=some_node_ids, node_type=some_type)`` to retrieve features for each node in ``some_node_ids``. All of the chosen nodes must be of type ``some_type``. - ``graph.node_features(nodes=some_node_ids)`` to retrieve features for each node in ``some_node_ids``. All of the chosen nodes must be of the same type, which will be inferred. This will be slower than providing the node type explicitly in the previous example. Args: nodes (list or hashable, optional): Node ID or list of node IDs, all of the same type node_type (hashable, optional): the type of the nodes. Returns: Numpy array containing the node features for the requested nodes or node type. """ return extract_element_features( self._nodes, self.unique_node_type, "node", nodes, node_type, use_ilocs )
[docs] def edge_features(self, edges=None, edge_type=None, use_ilocs=False): """ Get the numeric feature vectors for the specified edges or edge type. For graphs with a single edge type: - ``graph.edge_features()`` to retrieve features of all edges, in the same order as ``graph.edges()``. - ``graph.edge_features(edges=some_edge_ids)`` to retrieve features for each edge in ``some_edge_ids``. For graphs with multiple edge types: - ``graph.edge_features(edge_type=some_type)`` to retrieve features of all edges of type ``some_type``, in the same order as ``graph.edges(edge_type=some_type)``. - ``graph.edge_features(edges=some_edge_ids, edge_type=some_type)`` to retrieve features for each edge in ``some_edge_ids``. All of the chosen edges must be of type ``some_type``. - ``graph.edge_features(edges=some_edge_ids)`` to retrieve features for each edge in ``some_edge_ids``. All of the chosen edges must be of the same type, which will be inferred. This will be slower than providing the edge type explicitly in the previous example. Args: edges (list or hashable, optional): Edge ID or list of edge IDs, all of the same type edge_type (hashable, optional): the type of the edges. Returns: Numpy array containing the edge features for the requested edges or edge type. """ return extract_element_features( self._edges, self.unique_edge_type, "edge", edges, edge_type, use_ilocs )
################################################################## # Computationally intensive methods: def _edge_type_iloc_triples(self, selector=slice(None), stacked=False): source_ilocs = self._edges.sources[selector] source_type_ilocs = self._nodes.type_ilocs[source_ilocs] rel_type_ilocs = self._edges.type_ilocs[selector] target_ilocs = self._edges.targets[selector] target_type_ilocs = self._nodes.type_ilocs[target_ilocs] all_ilocs = source_type_ilocs, rel_type_ilocs, target_type_ilocs if stacked: return np.stack(all_ilocs, axis=-1) return all_ilocs def _edge_type_triples(self, selector=slice(None)): src_ilocs, rel_ilocs, tgt_ilocs = self._edge_type_iloc_triples( selector, stacked=False ) return ( self._nodes.types.from_iloc(src_ilocs), self._edges.types.from_iloc(rel_ilocs), self._nodes.types.from_iloc(tgt_ilocs), ) def _unique_type_triples(self, selector=slice(None)): all_type_ilocs = self._edge_type_iloc_triples(selector, stacked=True) if len(all_type_ilocs) == 0: # FIXME(https://github.com/numpy/numpy/issues/15559): if there's no edges, np.unique is # being called on a shape=(0, 3) ndarray, and hits "ValueError: cannot reshape array of # size 0 into shape (0,newaxis)", so we manually reproduce what would be returned ret = None, [] else: ret = np.unique(all_type_ilocs, axis=0, return_index=True) edge_ilocs = ret[1] # we've now got the indices for an edge with each triple, along with the counts of them, so # we can query to get the actual edge types (this is, at the time of writing, easier than # getting the actual type for each type iloc in the triples) unique_ets = self._edge_type_triples(edge_ilocs) return zip(*unique_ets) def _edge_metrics_by_type_triple(self, metrics): src_ty, rel_ty, tgt_ty = self._edge_type_triples() df = pd.DataFrame( { "src_ty": src_ty, "rel_ty": rel_ty, "tgt_ty": tgt_ty, "weight": self._edges.weights, } ) # if graph is undirected, we want to sort the triple if not self.is_directed(): sorted_types = df[["src_ty", "tgt_ty"]].to_numpy() sorted_types.sort(axis=1) df[["src_ty", "tgt_ty"]] = sorted_types return df.groupby(["src_ty", "rel_ty", "tgt_ty"]).agg(metrics)["weight"]
[docs] def info(self, show_attributes=None, sample=None, truncate=20): """ Return an information string summarizing information on the current graph. This includes node and edge type information and their attributes. Note: This requires processing all nodes and edges and could take a long time for a large graph. Args: show_attributes: Deprecated, unused. sample: Deprecated, unused. truncate (int, optional): If an integer, show only the ``truncate`` most common node and edge type triples; if ``None``, list each one individually. Returns: An information string. """ if show_attributes is not None: warnings.warn( "'show_attributes' is no longer used, remove it from the 'info()' call", DeprecationWarning, stacklevel=2, ) if sample is not None: warnings.warn( "'sample' is no longer used, remove it from the 'info()' call", DeprecationWarning, stacklevel=2, ) # always truncate the edge types listed for each node type, since they're redundant with the # individual listing of edge types, and make for a single very long line truncate_edge_types_per_node = 5 if truncate is not None: truncate_edge_types_per_node = min(truncate_edge_types_per_node, truncate) # Numpy processing is much faster than NetworkX processing, so we don't bother sampling. gs = self.create_graph_schema() node_feature_info = self._nodes.feature_info() edge_feature_info = self._edges.feature_info() def str_edge_type(et): n1, rel, n2 = et return f"{n1}-{rel}->{n2}" def str_feature(feature_info, ty): feature_shape, feature_dtype = feature_info[ty] if len(feature_shape) > 1: return f"{feature_dtype.name} tensor, shape {feature_shape}" elif feature_shape[0] == 0: return "none" else: return f"{feature_dtype.name} vector, length {feature_shape[0]}" def str_node_type(count, nt): feature_text = str_feature(node_feature_info, nt) edges = gs.schema[nt] if edges: edge_types = comma_sep( [str_edge_type(et) for et in gs.schema[nt]], limit=truncate_edge_types_per_node, stringify=str, ) else: edge_types = "none" return f"{nt}: [{count}]\n Features: {feature_text}\n Edge types: {edge_types}" def edge_type_info(et, metrics): feature_text = str_feature(edge_feature_info, et.rel) if metrics.min == metrics.max: weights_text = ( "all 1 (default)" if metrics.min == 1 else f"all {metrics.min:.6g}" ) else: weights_text = f"range=[{metrics.min:.6g}, {metrics.max:.6g}], mean={metrics.mean:.6g}, std={metrics.std:.6g}" return f"{str_edge_type(et)}: [{metrics.count}]\n Weights: {weights_text}\n Features: {feature_text}" # sort the node types in decreasing order of frequency node_types = sorted( ((len(self.nodes(node_type=nt)), nt) for nt in gs.node_types), reverse=True ) nodes = separated( [str_node_type(count, nt) for count, nt in node_types], limit=truncate, stringify=str, sep="\n ", ) metric_names = ["count", "min", "max", "mean", "std"] et_metrics = self._edge_metrics_by_type_triple(metrics=metric_names) edge_types = sorted( ( (metrics.count, EdgeType(*metrics.Index), metrics,) for metrics in et_metrics.itertuples() ), reverse=True, ) edges = separated( [edge_type_info(et, metrics) for _, et, metrics in edge_types], limit=truncate, stringify=str, sep="\n ", ) directed_str = "Directed" if self.is_directed() else "Undirected" lines = [ f"{type(self).__name__}: {directed_str} multigraph", f" Nodes: {self.number_of_nodes()}, Edges: {self.number_of_edges()}", "", " Node types:", ] if nodes: lines.append(" " + nodes) lines.append("") lines.append(" Edge types:") if edges: lines.append(" " + edges) return "\n".join(lines)
[docs] def create_graph_schema(self, nodes=None): """ Create graph schema from the current graph. Arguments: nodes (list): A list of node IDs to use to build schema. This must represent all node types and all edge types in the graph. If not specified, all nodes and edges in the graph are used. Returns: GraphSchema object. """ graph_schema = {nt: set() for nt in self.node_types} edge_types = set() if nodes is None: selector = slice(None) else: node_ilocs = self._nodes.ids.to_iloc(nodes) selector = np.isin(self._edges.sources, node_ilocs) & np.isin( self._edges.targets, node_ilocs ) for n1, rel, n2 in self._unique_type_triples(selector=selector): edge_type_tri = EdgeType(n1, rel, n2) edge_types.add(edge_type_tri) graph_schema[n1].add(edge_type_tri) if not self.is_directed(): edge_type_tri = EdgeType(n2, rel, n1) edge_types.add(edge_type_tri) graph_schema[n2].add(edge_type_tri) # Create ordered list of edge_types edge_types = sorted(edge_types) # Create keys for node and edge types schema = { node_label: sorted(node_data) for node_label, node_data in graph_schema.items() } return GraphSchema( self.is_directed(), sorted(self.node_types), edge_types, schema )
[docs] def node_degrees(self, use_ilocs=False) -> Mapping[Any, int]: """ Obtains a map from node to node degree. use_ilocs (bool): if True return :ref:`node ilocs <iloc-explanation>` Returns: The degree of each node. """ degrees = self._edges.degrees() if use_ilocs: return degrees node_ids = self.node_ilocs_to_ids(list(degrees.keys())) return defaultdict( int, ((node_id, degree) for node_id, degree in zip(node_ids, degrees.values())), )
[docs] def to_adjacency_matrix( self, nodes: Optional[Iterable] = None, weighted=False, edge_type=None ): """ Obtains a SciPy sparse adjacency matrix of edge weights. By default (``weighted=False``), each element of the matrix contains the number of edges between the two vertices (only 0 or 1 in a graph without multi-edges). Args: nodes (iterable): The optional collection of nodes comprising the subgraph. If specified, then the adjacency matrix is computed for the subgraph; otherwise, it is computed for the full graph. weighted (bool): If true, use the edge weight column from the graph instead of edge counts (weights from multi-edges are summed). edge_type (hashable, optional): If set (to an edge type), only includes edges of that type, otherwise uses all edges. Returns: The weighted adjacency matrix. """ if edge_type is None: type_selector = slice(None) else: type_selector = self._edges.type_range(edge_type) sources = self._edges.sources[type_selector] targets = self._edges.targets[type_selector] if nodes is None: # if `nodes` is None use overall ilocs (for the original graph) src_idx = sources tgt_idx = targets selector = slice(None) n = self.number_of_nodes() else: node_ilocs = self._nodes.ids.to_iloc(nodes) index = ExternalIdIndex(node_ilocs) n = len(index) selector = np.isin(sources, node_ilocs) & np.isin(targets, node_ilocs) # these indices are computed relative to the index above src_idx = index.to_iloc(sources[selector]) tgt_idx = index.to_iloc(targets[selector]) if weighted: weights = self._edges.weights[type_selector][selector] else: weights = np.ones(src_idx.shape, dtype=self._edges.weights.dtype) adj = sps.csr_matrix((weights, (src_idx, tgt_idx)), shape=(n, n)) if not self.is_directed() and n > 0: # in an undirected graph, the adjacency matrix should be symmetric: which means counting # weights from either "incoming" or "outgoing" edges, but not double-counting self loops # FIXME https://github.com/scipy/scipy/issues/11949: these operations, particularly the # diagonal, don't work for an empty matrix (n == 0) backward = adj.transpose(copy=True) # this is setdiag(0), but faster, since it doesn't change the sparsity structure of the # matrix (https://github.com/scipy/scipy/issues/11600) (nonzero,) = backward.diagonal().nonzero() backward[nonzero, nonzero] = 0 adj += backward # this is a multigraph, let's eliminate any duplicate entries adj.sum_duplicates() return adj
[docs] def subgraph(self, nodes): """ Compute the node-induced subgraph implied by ``nodes``. Args: nodes (iterable): The nodes in the subgraph. Returns: A :class:`.StellarGraph` or :class:`.StellarDiGraph` instance containing only the nodes in ``nodes``, and any edges between them in ``self``. It contains the same node & edge types, node features and edge weights as in ``self``. """ node_ilocs = self._nodes.ids.to_iloc(nodes, strict=True) node_types = self._nodes.type_of_iloc(node_ilocs) node_type_to_ilocs = pd.Series(node_ilocs, index=node_types).groupby(level=0) node_frames = { type_name: pd.DataFrame( self._nodes.features(type_name, ilocs), index=self._nodes.ids.from_iloc(ilocs), ) for type_name, ilocs in node_type_to_ilocs } # FIXME(#985): this is O(edges in graph) but could potentially be optimised to O(edges in # graph incident to `nodes`), which could be much fewer if `nodes` is small edge_ilocs = np.where( np.isin(self._edges.sources, node_ilocs) & np.isin(self._edges.targets, node_ilocs) ) edge_frame = pd.DataFrame( { "id": self._edges.ids.from_iloc(edge_ilocs), globalvar.SOURCE: self._nodes.ids.from_iloc( self._edges.sources[edge_ilocs] ), globalvar.TARGET: self._nodes.ids.from_iloc( self._edges.targets[edge_ilocs] ), globalvar.WEIGHT: self._edges.weights[edge_ilocs], }, index=self._edges.type_of_iloc(edge_ilocs), ) edge_frames = { type_name: df.set_index("id") for type_name, df in edge_frame.groupby(level=0) } cls = StellarDiGraph if self.is_directed() else StellarGraph return cls(node_frames, edge_frames)
[docs] def connected_components(self): """ Compute the connected components in this graph, ordered by size. The nodes in the largest component can be computed with ``nodes = next(graph.connected_components())``. The node IDs returned by this method can be used to compute the corresponding subgraph with ``graph.subgraph(nodes)``. For directed graphs, this computes the weakly connected components. This effectively treating each edge as undirected. Returns: An iterator over sets of node IDs in each connected component, from the largest (most nodes) to smallest (fewest nodes). """ adj = self.to_adjacency_matrix() count, cc_labels = sps.csgraph.connected_components(adj, directed=False) cc_sizes = np.bincount(cc_labels, minlength=count) cc_by_size = np.argsort(cc_sizes)[::-1] return ( self._nodes.ids.from_iloc(cc_labels == cc_label) for cc_label in cc_by_size )
[docs] def to_networkx( self, node_type_attr=globalvar.TYPE_ATTR_NAME, edge_type_attr=globalvar.TYPE_ATTR_NAME, edge_weight_attr=globalvar.WEIGHT, feature_attr=globalvar.FEATURE_ATTR_NAME, node_type_name=None, edge_type_name=None, edge_weight_label=None, feature_name=None, ): """ Create a NetworkX MultiGraph or MultiDiGraph instance representing this graph. Args: node_type_attr (str): the name of the attribute to use to store a node's type (or label). edge_type_attr (str): the name of the attribute to use to store a edge's type (or label). edge_weight_attr (str): the name of the attribute to use to store a edge's weight. feature_attr (str, optional): the name of the attribute to use to store a node's feature vector; if ``None``, feature vectors are not stored within each node. node_type_name (str): Deprecated, use ``node_type_attr``. edge_type_name (str): Deprecated, use ``edge_type_attr``. edge_weight_label (str): Deprecated, use ``edge_weight_attr``. feature_name (str, optional): Deprecated, use ``feature_attr``. Returns: An instance of `networkx.MultiDiGraph` (if directed) or `networkx.MultiGraph` (if undirected) containing all the nodes & edges and their types & features in this graph. """ import networkx if node_type_name is not None: warnings.warn( "the 'node_type_name' parameter has been replaced by 'node_type_attr'", DeprecationWarning, stacklevel=2, ) node_type_attr = node_type_name if edge_type_name is not None: warnings.warn( "the 'edge_type_name' parameter has been replaced by 'edge_type_attr'", DeprecationWarning, stacklevel=2, ) edge_type_attr = edge_type_name if edge_weight_label is not None: warnings.warn( "the 'edge_weight_label' parameter has been replaced by 'edge_weight_attr'", DeprecationWarning, stacklevel=2, ) edge_weight_attr = edge_weight_label if feature_name is not None: warnings.warn( "the 'feature_name' parameter has been replaced by 'feature_attr'", DeprecationWarning, stacklevel=2, ) feature_attr = feature_name if self.is_directed(): graph = networkx.MultiDiGraph() else: graph = networkx.MultiGraph() for ty in self.node_types: node_ids = self.nodes(node_type=ty) ty_dict = {node_type_attr: ty} if feature_attr is not None: features = self.node_features(node_ids, node_type=ty) for node_id, node_features in zip(node_ids, features): graph.add_node( node_id, **ty_dict, **{feature_attr: node_features}, ) else: graph.add_nodes_from(node_ids, **ty_dict) iterator = zip( self._nodes.ids.from_iloc(self._edges.sources), self._nodes.ids.from_iloc(self._edges.targets), self._edges.type_of_iloc(slice(None)), self._edges.weights, ) graph.add_edges_from( (src, dst, {edge_type_attr: type_, edge_weight_attr: weight}) for src, dst, type_, weight in iterator ) return graph
def _adjacency_types(self, graph_schema: GraphSchema, use_ilocs=False): """ Obtains the edges in the form of the typed mapping: {edge_type_triple: {source_node: [target_node, ...]}} Args: graph_schema: The graph schema. use_ilocs (bool): if True return :ref:`node ilocs <iloc-explanation>` Returns: The edge types mapping. """ source_types, rel_types, target_types = self._edge_type_triples(slice(None)) triples = defaultdict(lambda: defaultdict(lambda: [])) sources = self._edges.sources targets = self._edges.targets if not use_ilocs: sources = self._nodes.ids.from_iloc(sources) targets = self._nodes.ids.from_iloc(targets) iterator = zip(source_types, rel_types, target_types, sources, targets,) for src_type, rel_type, tgt_type, src, tgt in iterator: triple = EdgeType(src_type, rel_type, tgt_type) triples[triple][src].append(tgt) if not self.is_directed() and src != tgt: other_triple = EdgeType(tgt_type, rel_type, src_type) triples[other_triple][tgt].append(src) for subdict in triples.values(): for v in subdict.values(): # each list should be in order, to ensure sampling methods are deterministic v.sort(key=str) return triples def _edge_weights( self, source_node: Any, target_node: Any, use_ilocs=False ) -> List[Any]: """ Obtains the weights of edges between the given pair of nodes. Args: source_node (int): The source node. target_node (int): The target node. use_ilocs (bool): if True source_node and target_node are treated as :ref:`node ilocs <iloc-explanation>`. Returns: list: The edge weights. """ # self loops should only be counted once, which means they're effectively always a directed # edge at the storage level, unlikely other edges in an undirected graph. This is # particularly important with the intersection1d call, where the source_ilocs and # target_ilocs will be equal, when source_node == target_node, and thus the intersection # will contain all incident edges. effectively_directed = self.is_directed() or source_node == target_node both_dirs = not effectively_directed if not use_ilocs: source_node = self._nodes.ids.to_iloc([source_node])[0] target_node = self._nodes.ids.to_iloc([target_node])[0] source_edge_ilocs = self._edges.edge_ilocs( source_node, ins=both_dirs, outs=True ) target_edge_ilocs = self._edges.edge_ilocs( target_node, ins=True, outs=both_dirs ) ilocs = np.intersect1d(source_edge_ilocs, target_edge_ilocs, assume_unique=True) return [float(x) for x in self._edges.weights[ilocs]]
# A convenience class that merely specifies that edges have direction.
[docs]class StellarDiGraph(StellarGraph): def __init__( self, nodes=None, edges=None, *, source_column=globalvar.SOURCE, target_column=globalvar.TARGET, edge_weight_column=globalvar.WEIGHT, edge_type_column=None, node_type_default=globalvar.NODE_TYPE_DEFAULT, edge_type_default=globalvar.EDGE_TYPE_DEFAULT, dtype="float32", # legacy arguments graph=None, node_type_name=globalvar.TYPE_ATTR_NAME, edge_type_name=globalvar.TYPE_ATTR_NAME, node_features=None, ): super().__init__( nodes=nodes, edges=edges, is_directed=True, source_column=source_column, target_column=target_column, edge_weight_column=edge_weight_column, edge_type_column=edge_type_column, node_type_default=node_type_default, edge_type_default=edge_type_default, dtype=dtype, # legacy arguments graph=graph, node_type_name=node_type_name, edge_type_name=edge_type_name, node_features=node_features, )