# Source code for stellargraph.data.edge_splitter

```
# -*- coding: utf-8 -*-
#
# Copyright 2017-2020 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ["EdgeSplitter"]
import datetime
import warnings
import networkx as nx
import pandas as pd
import numpy as np
from math import isclose
from ..core import StellarGraph
from ..globalvar import FEATURE_ATTR_NAME
[docs]class EdgeSplitter(object):
"""
Class for generating training and test data for link prediction in graphs.
The class requires as input a graph (in networkx format) and a percentage as a function of the total number of edges
in the given graph of the number of positive and negative edges to sample. For heterogeneous graphs, the caller
can also specify the type of edge and an edge property to split on. In the latter case, only a date property
can be used and it must be in the format ``dd/mm/yyyy``. A date to be used as a threshold value such that only
edges that have date after the threshold must be given. This effects only the sampling of positive edges.
Negative edges are sampled at random by (for 'global' method) selecting two nodes in the graph and
then checking if these edges are connected or not. If not, the pair of nodes is considered a negative sample.
Otherwise, it is discarded and the process repeats. Alternatively, negative edges are sampled (for 'local' method)
using DFS search at a distance from the source node (selected at random from all nodes in the graph)
sampled according to a given set of probabilities.
Positive edges can be sampled so that when they are subsequently removed from the graph, the reduced graph is either
guaranteed, or not guaranteed, to remain connected. In the former case, graph connectivity is maintained by first
calculating the minimum spanning tree. The edges that belong to the minimum spanning tree are protected from
removal, and therefore cannot be sampled for the training set. The edges that do not belong to the minimum spanning
tree are then sampled uniformly at random, until the required number of positive edges have been sampled for the
training set. In the latter case, when connectedness of the reduced graph is not guaranteed, positive edges are
sampled uniformly at random from all the edges in the graph, regardless of whether they belong to the spanning tree
(which is not calculated in this case).
Args:
g (StellarGraph or networkx object): The graph to sample edges from.
g_master (StellarGraph or networkx object): The graph representing the original dataset and a superset of the
graph g. If it is not None, then when positive and negative edges are sampled, care is taken to make sure
that a true positive edge is not sampled as a negative edge.
"""
def __init__(self, g, g_master=None):
# rather than rewrite this to use StellarGraph natively, this has the desired API (StellarGraphs in and
# StellarGraphs out) by converting to/from NetworkX at the boundaries
self._input_was_stellargraph = isinstance(g, StellarGraph)
if self._input_was_stellargraph:
g = g.to_networkx()
if isinstance(g_master, StellarGraph):
g_master = g_master.to_networkx()
# the original graph copied over
self.g = g.copy()
self.g_master = g_master
# placeholder: it will hold the subgraph of self.g after edges are removed as positive training samples
self.g_train = None
self.positive_edges_ids = None
self.positive_edges_labels = None
self.negative_edges_ids = None
self.negative_edges_labels = None
self.negative_edge_node_distances = None
self.minedges = None # the minimum spanning tree as a list of edges.
self.minedges_set = None # lookup dictionary for edges in minimum spanning tree
self._random = None
def _train_test_split_homogeneous(
self, p, method, probs=None, keep_connected=False
):
"""
Method for edge splitting applied to homogeneous graphs.
Args:
p (float): Percent of edges to be returned. It is calculated as a function of the total number of edges
in the original graph. If the graph is heterogeneous, the percentage is calculated
as a function of the total number of edges that satisfy the edge_label, edge_attribute_label and
edge_attribute_threshold values given.
method (string): Should be 'global' or 'local'. Specifies the method for selecting negative examples.
probs (list of float, optional): If method is 'local' then this vector of floats specifies the probabilities for
sampling at each depth from the source node. The first value should be 0.0 and all values should sum to 1.0.
keep_connected (bool): If True then when positive edges are removed care is taken that the reduced graph
remains connected. If False, positive edges are removed without guaranteeing the connectivity of the reduced graph.
Returns:
2 numpy arrays, the first Nx2 holding the node ids for the edges and the second Nx1 holding the edge
labels, 0 for negative and 1 for positive example.
"""
# minedges are those edges that if removed we might end up with a disconnected graph after the positive edges
# have been sampled.
if keep_connected:
self.minedges = self._get_minimum_spanning_edges()
else:
self.minedges = []
self.minedges_set = set()
# Sample the positive examples
positive_edges = self._reduce_graph(minedges=self.minedges_set, p=p)
df = pd.DataFrame(positive_edges)
self.positive_edges_ids = np.array(df.iloc[:, 0:2])
self.positive_edges_labels = np.array(df.iloc[:, 2])
if method == "global":
negative_edges = self._sample_negative_examples_global(
p=p, limit_samples=len(positive_edges)
)
else: # method == 'local'
if probs is None: # use default values if not given, by warn user
probs = [0.0, 0.25, 0.50, 0.25]
warnings.warn(
"Using default sampling probabilities (distance from source node): {}".format(
probs
),
RuntimeWarning,
stacklevel=2,
)
negative_edges = self._sample_negative_examples_local_dfs(
p=p, probs=probs, limit_samples=len(positive_edges)
)
df = pd.DataFrame(negative_edges)
self.negative_edges_ids = np.array(df.iloc[:, 0:2])
self.negative_edges_labels = np.array(df.iloc[:, 2])
if len(self.positive_edges_ids) == 0:
raise Exception("Could not sample any positive edges")
if len(self.negative_edges_ids) == 0:
raise Exception("Could not sample any negative edges")
edge_data_ids = np.vstack((self.positive_edges_ids, self.negative_edges_ids))
edge_data_labels = np.hstack(
(self.positive_edges_labels, self.negative_edges_labels)
)
print(
"** Sampled {} positive and {} negative edges. **".format(
len(self.positive_edges_ids), len(self.negative_edges_ids)
)
)
return edge_data_ids, edge_data_labels
def _train_test_split_heterogeneous(
self,
p,
method,
edge_label,
probs=None,
keep_connected=False,
edge_attribute_label=None,
edge_attribute_threshold=None,
):
"""
Splitting edge data based on edge type or edge type and edge property. The edge property must be a date in the
format ``dd/mm/yyyy``. If splitting by date, then a threshold value must also be given such that only edges with
date larger than the threshold can be in the set of positive examples. The edge property does not effect the
sampling of negative examples.
Args:
p (float): Percent of edges to be returned. It is calculated as a function of the total number of edges
in the original graph. If the graph is heterogeneous, the percentage is calculated
as a function of the total number of edges that satisfy the edge_label, edge_attribute_label and
edge_attribute_threshold values given.
method (str): Should be 'global' or 'local'. Specifies the method for selecting negative examples.
edge_label (str): The edge type to split on
probs (list of float, optional): If method=='local' then this vector of floats specifies the probabilities for
sampling at each depth from the source node. The first value should be 0.0 and all values should sum to 1.0.
keep_connected (bool): If True then when positive edges are removed care is taken that the reduced graph
remains connected. If False, positive edges are removed without guaranteeing the connectivity of the reduced graph.
edge_attribute_label (str): The label for the edge attribute to split on
edge_attribute_threshold (str, optional): The threshold value applied to the edge attribute when sampling positive
examples
Returns:
2 numpy arrays, the first N × 2 holding the node ids for the edges and the second N × 1 holding the edge
labels, 0 for negative and 1 for positive example.
"""
# minedges are those edges that if removed we might end up with a disconnected graph after the positive edges
# have been sampled.
if keep_connected:
self.minedges = self._get_minimum_spanning_edges()
else:
self.minedges = []
self.minedges_set = set()
# Note: The caller guarantees the edge_label is not None so we don't have to check here again.
if edge_attribute_threshold is None:
positive_edges = self._reduce_graph_by_edge_type(
minedges=self.minedges_set, p=p, edge_label=edge_label
)
else:
positive_edges = self._reduce_graph_by_edge_type_and_attribute(
minedges=self.minedges_set,
p=p,
edge_label=edge_label,
edge_attribute_label=edge_attribute_label,
edge_attribute_threshold=edge_attribute_threshold,
)
if len(positive_edges) == 0:
raise Exception(
"ERROR: Unable to sample any positive edges of type '{}'".format(
edge_label
)
)
df = pd.DataFrame(positive_edges)
self.positive_edges_ids = np.array(df.iloc[:, 0:2])
self.positive_edges_labels = np.array(df.iloc[:, 2])
if method == "global":
negative_edges = self._sample_negative_examples_by_edge_type_global(
p=p,
edges=positive_edges,
edge_label=edge_label,
limit_samples=len(positive_edges),
)
else: # method == 'local'
if probs is None:
probs = [0.0, 0.25, 0.50, 0.25]
warnings.warn(
"Using default sampling probabilities (distance from source node): {}".format(
probs
),
RuntimeWarning,
stacklevel=2,
)
negative_edges = self._sample_negative_examples_by_edge_type_local_dfs(
p=p,
probs=probs,
edges_positive=positive_edges,
edge_label=edge_label,
limit_samples=len(positive_edges),
)
df = pd.DataFrame(negative_edges)
self.negative_edges_ids = np.array(df.iloc[:, 0:2])
self.negative_edges_labels = np.array(df.iloc[:, 2])
if len(self.positive_edges_ids) == 0:
raise Exception("Could not sample any positive edges")
if len(self.negative_edges_ids) == 0:
raise Exception("Could not sample any negative edges")
edge_data_ids = np.vstack((self.positive_edges_ids, self.negative_edges_ids))
edge_data_labels = np.hstack(
(self.positive_edges_labels, self.negative_edges_labels)
)
print(
"** Sampled {} positive and {} negative edges. **".format(
len(self.positive_edges_ids), len(self.negative_edges_ids)
)
)
return edge_data_ids, edge_data_labels
[docs] def train_test_split(
self,
p=0.5,
method="global",
probs=None,
keep_connected=False,
edge_label=None,
edge_attribute_label=None,
edge_attribute_threshold=None,
attribute_is_datetime=None,
seed=None,
):
"""
Generates positive and negative edges and a graph that has the same nodes as the original but the positive
edges removed. It can be used to generate data from homogeneous and heterogeneous graphs.
For heterogeneous graphs, positive and negative examples can be generated based on specified edge type or
edge type and edge property given a threshold value for the latter.
Args:
p (float): Percent of edges to be returned. It is calculated as a function of the total number of edges
in the original graph. If the graph is heterogeneous, the percentage is calculated
as a function of the total number of edges that satisfy the edge_label, edge_attribute_label and
edge_attribute_threshold values given.
method (str): How negative edges are sampled. If 'global', then nodes are selected at random.
If 'local' then the first nodes is sampled from all nodes in the graph, but the second node is
chosen to be from the former's local neighbourhood.
probs (list): list The probabilities for sampling a node that is k-hops from the source node,
e.g., [0.25, 0.75] means that there is a 0.25 probability that the target node will be 1-hope away from the
source node and 0.75 that it will be 2 hops away from the source node. This only affects sampling of
negative edges if method is set to 'local'.
keep_connected (bool): If True then when positive edges are removed care is taken that the reduced graph
remains connected. If False, positive edges are removed without guaranteeing the connectivity of the reduced graph.
edge_label (str, optional) If splitting based on edge type, then this parameter specifies the key for the type
of edges to split on.
edge_attribute_label (str, optional): The label for the edge attribute to split on.
edge_attribute_threshold (str, optional): The threshold value applied to the edge attribute when sampling positive
examples.
attribute_is_datetime (bool, optional): Specifies if edge attribute is datetime or not.
seed (int, optional): seed for random number generator, positive int or 0
Returns:
The reduced graph (positive edges removed) and the edge data as 2 numpy arrays, the first array of
dimensionality N × 2 (where N is the number of edges) holding the node ids for the edges and the second of
dimensionality N × 1 holding the edge labels, 0 for negative and 1 for positive examples. The graph
matches the input graph passed to the :class:`.EdgeSplitter` constructor: the returned graph is a
:class:`.StellarGraph` instance if the input graph was one, and, similarly, a NetworkX graph if the input
graph was one.
"""
if p <= 0 or p >= 1:
raise ValueError("The value of p must be in the interval (0,1)")
if method != "global" and method != "local":
raise ValueError(
"Invalid method {}; valid options are 'local' or 'global'".format(
method
)
)
if not isinstance(keep_connected, (bool,)):
raise ValueError(
"({}) The flag keep_connected be boolean type.".format(
type(self).__name__
)
)
if seed is not None:
if seed < 0:
raise ValueError(
"({}) The random number generator seed value, seed, should be positive integer or None.".format(
type(self).__name__
)
)
if type(seed) != int:
raise ValueError(
"({}) The random number generator seed value, seed, should be integer type or None.".format(
type(self).__name__
)
)
if self._random is None: # only do this one
self._random = np.random.RandomState(seed=seed)
if edge_label is not None: # working with a heterogeneous graph
if (
edge_attribute_label
and edge_attribute_threshold
and not attribute_is_datetime
):
raise ValueError("You can only split by datetime edge attribute")
else: # all three are True
edge_data_ids, edge_data_labels = self._train_test_split_heterogeneous(
p=p,
method=method,
edge_label=edge_label,
edge_attribute_label=edge_attribute_label,
edge_attribute_threshold=edge_attribute_threshold,
keep_connected=keep_connected,
)
else: # working with a homogeneous graph
edge_data_ids, edge_data_labels = self._train_test_split_homogeneous(
p=p, method=method, probs=probs, keep_connected=keep_connected
)
if self._input_was_stellargraph:
# if the graphs came in as a StellarGraph, return one too
result_graph = StellarGraph.from_networkx(
self.g_train, node_features=FEATURE_ATTR_NAME
)
else:
result_graph = self.g_train
return result_graph, edge_data_ids, edge_data_labels
def _get_edges(
self, edge_label, edge_attribute_label=None, edge_attribute_threshold=None
):
"""
Method that filters the edges in the self.g (heterogeneous) graph based on either the edge type
specified by edge_label, or based on edges of edge_label type that have property edge_attribute_label and
the value of the latter property is larger than the edge_attribute_threshold.
Args:
edge_label (str): The type of edges to filter
edge_attribute_label (str, optional): The edge attribute to use for filtering graph edges
edge_attribute_threshold (str, optional): The threshold applied to the edge attribute for filtering edges.
Returns:
(list) List of edges that satisfy the filtering criteria.
"""
# the graph in networkx format is stored in self.g_train
if self.g.is_multigraph():
all_edges = list(self.g.edges(keys=True))
else:
all_edges = list(self.g.edges())
if edge_attribute_label is None or edge_attribute_threshold is None:
# filter by edge_label
edges_with_label = [
e for e in all_edges if self.g.get_edge_data(*e)["label"] == edge_label
]
else:
# filter by edge label, edge attribute and threshold value
edge_attribute_threshold_dt = datetime.datetime.strptime(
edge_attribute_threshold, "%d/%m/%Y"
)
edges_with_label = [
e
for e in all_edges
if (
self.g.get_edge_data(*e)["label"] == edge_label
and datetime.datetime.strptime(
self.g.get_edge_data(*e)[edge_attribute_label], "%d/%m/%Y"
)
> edge_attribute_threshold_dt
)
]
return edges_with_label
def _get_edge_source_and_target_node_types(self, edges):
"""
Method that given a list of edges, for each edge it determines the type of the source and target
nodes and then returns them as a list of tuples.
This routine is necessary because networkx does not provide a direct method for determining the type of nodes
given an edge.
Args:
edges (list): List of edges as returned by networkx graph method edges().
Returns: (list) Returns a list of 2-tuples such that each value in the tuple holds the type (as str) of the
source and target nodes for each element in edges.
"""
# uses self.g_train but any graph object would do since nodes are shared
all_nodes = self.g_train.nodes(data=True)
# dictionary that maps node id to node attributes
all_nodes_as_dict = {n[0]: n[1] for n in all_nodes}
edge_node_types = set()
for edge in edges:
edge_node_types.add(
(
all_nodes_as_dict[edge[0]]["label"],
all_nodes_as_dict[edge[1]]["label"],
)
)
return edge_node_types
def _reduce_graph_by_edge_type_and_attribute(
self,
minedges,
p=0.5,
edge_label=None,
edge_attribute_label=None,
edge_attribute_threshold=None,
):
"""
Reduces the graph self.g_train by a factor p by removing existing edges not on minedges list such that
the reduced tree remains connected. Edges are removed based on the edge type and the values of a given edge
attribute and a threshold applied to the latter.
Args:
minedges (list): Spanning tree edges that cannot be removed.
p (float): Factor by which to reduce the size of the graph.
edge_label (str): The edge type to consider.
edge_attribute_label (str): The edge attribute to consider.
edge_attribute_threshold (str): The threshold value; only edges with attribute value larger than the
threshold can be removed.
Returns:
Returns the list of edges removed from the graph (also modifies the graph self.g_train
by removing the said edges)
"""
# We check that the parameters are given values but we don't check if the graph has edges with label
# edge_label and edge attributes with label edge_attribute_label. For now, we assume that the given values
# are valid; if not, then some cryptic exception is bound to be raised later on in the code.
if edge_label is None:
raise ValueError("edge_label must be specified.")
if edge_attribute_label is None:
raise ValueError("edge_attribute_label must be specified.")
if edge_attribute_threshold is None:
raise ValueError("attribute_threshold must be specified.")
# copy the original graph and start over in case this is not the first time
# reduce_graph has been called.
self.g_train = self.g.copy()
# Filter the graph's edges based on the edge type, edge attribute, and attribute threshold value given.
all_edges = self._get_edges(
edge_label=edge_label,
edge_attribute_label=edge_attribute_label,
edge_attribute_threshold=edge_attribute_threshold,
)
# Also, calculate the number of these edges in the graph.
num_edges_total = len(all_edges)
# print("Graph has {} edges of type {}".format(num_edges_total, edge_label))
# Multiply this number by p to determine the number of positive edge examples to sample
num_edges_to_remove = int(num_edges_total * p)
# shuffle the edges
self._random.shuffle(all_edges)
#
# iterate over the list of edges and for each edge if the edge is not in minedges, remove it from the graph
# until num_edges_to_remove edges have been removed and the graph reduced to p of its original size
count = 0
removed_edges = []
for edge in all_edges:
# Support minedges having keys (NetworkX 2.x) or not (NetworkX 1.x)
if edge not in minedges and (edge[0], edge[1]) not in minedges:
removed_edges.append(
(
edge[0],
edge[1],
1,
) # should this be edge + (1,) to support multigraphs?
) # the last entry is the label
self.g_train.remove_edge(*edge)
count += 1
if count == num_edges_to_remove:
return removed_edges
if len(removed_edges) < num_edges_to_remove:
raise ValueError(
"Unable to sample {} positive edges (could only sample {} positive edges). Consider using smaller value for p or set keep_connected=False".format(
num_edges_to_remove, len(removed_edges)
)
)
def _reduce_graph_by_edge_type(self, minedges, p=0.5, edge_label=None):
"""
Reduces the graph self.g_train by a factor p by removing existing edges not on minedges list such that
the reduced tree remains connected. Edges are removed based on the edge type.
Args:
minedges (list): Minimum spanning tree edges that cannot be removed.
p (float): Factor by which to reduce the size of the graph.
edge_label (str): The edge type to consider.
Returns:
(list) Returns the list of edges removed from self.g_train (also modifies self.g_train by removing said
edges)
"""
if edge_label is None:
raise ValueError("edge_label must be specified")
# copy the original graph and start over in case this is not the first time
# reduce_graph has been called.
self.g_train = self.g.copy()
# Filter the graph's edges based on the specified edge_label
all_edges = self._get_edges(edge_label=edge_label)
num_edges_total = len(all_edges)
print("Network has {} edges of type {}".format(num_edges_total, edge_label))
# Multiply this number by p to determine the number of positive edge examples to sample
num_edges_to_remove = int(num_edges_total * p)
# shuffle the edges
self._random.shuffle(all_edges)
# iterate over the list of filtered edges and for each edge if the edge is not in minedges, remove it from
# the graph until num_edges_to_remove edges have been removed and the graph is reduced to p of its original
# size
count = 0
removed_edges = []
for edge in all_edges:
# Support minedges having keys (NetworkX 2.x) or not (NetworkX 1.x)
if edge not in minedges and (edge[0], edge[1]) not in minedges:
removed_edges.append(
(edge[0], edge[1], 1)
) # the last entry is the label
self.g_train.remove_edge(*edge)
count += 1
if count == num_edges_to_remove:
return removed_edges
if len(removed_edges) < num_edges_to_remove:
raise ValueError(
"Unable to sample {} positive edges (could only sample {} positive edges). Consider using smaller value for p or set keep_connected=False".format(
num_edges_to_remove, len(removed_edges)
)
)
def _reduce_graph(self, minedges, p=0.5):
"""
Reduces the graph self.g_train by a factor p by removing existing edges not on minedges list such that
the reduced tree remains connected. Edge type is ignored and all edges are treated equally.
Args:
minedges (list): Minimum spanning tree edges that cannot be removed.
p (float): Factor by which to reduce the size of the graph.
Returns:
(list) Returns the list of edges removed from self.g_train (also modifies self.g_train by removing the
said edges)
"""
# copy the original graph and start over in case this is not the first time
# reduce_graph has been called.
self.g_train = self.g.copy()
# For multigraphs we should probably use keys
use_keys_in_edges = self.g.is_multigraph()
# For NX 1.x/2.x compatibilty we need to match length of minedges
if len(minedges) > 0:
use_keys_in_edges = len(next(iter(minedges))) == 3
if use_keys_in_edges:
all_edges = list(self.g_train.edges(keys=True))
else:
all_edges = list(self.g_train.edges())
num_edges_to_remove = int(self.g_train.number_of_edges() * p)
if num_edges_to_remove > (self.g_train.number_of_edges() - len(self.minedges)):
raise ValueError(
"Not enough positive edges to sample after reserving {} number of edges for maintaining graph connectivity. Consider setting keep_connected=False.".format(
len(self.minedges)
)
)
# shuffle the edges
self._random.shuffle(all_edges)
# iterate over the list of edges and for each edge if the edge is not in minedges, remove it from the graph
# until num_edges_to_remove edges have been removed and the graph reduced to p of its original size
count = 0
removed_edges = []
for edge in all_edges:
if edge not in minedges:
removed_edges.append(
(edge[0], edge[1], 1)
) # the last entry is the label
self.g_train.remove_edge(*edge)
count += 1
if count == num_edges_to_remove:
return removed_edges
def _sample_negative_examples_by_edge_type_local_dfs(
self,
p=0.5,
probs=None,
edges_positive=None,
edge_label=None,
limit_samples=None,
):
"""
This method produces a list of edges that don't exist in graph self.g (negative examples). The number of
negative edges produced is equal to the number of edges in the graph times p (that should be in the range (0,1]
or limited to maximum limit_samples if the latter is not None. The negative samples are between node types
as inferred from the edge type of the positive examples previously removed from the graph and given in
edges_positive.
This method uses depth-first search to efficiently (memory-wise) sample negative edges based on the local
neighbourhood of randomly (uniformly) sampled source nodes at distances defined by the probabilities in probs.
The source graph is not modified.
Args:
p (float): Factor that multiplies the number of edges in the graph and determines the number of negative
edges to be sampled.
probs (list): Probability distribution for the distance between source and target nodes.
edges_positive (list): The positive edge examples that have previously been removed from the graph
edge_label (str): The edge type to sample negative examples of
limit_samples (int, optional): It limits the maximum number of samples to the given number, if not None
Returns:
(list) A list of 2-tuples that are pairs of node IDs that don't have an edge between them in the graph.
"""
if probs is None:
probs = [0.0, 0.25, 0.50, 0.25]
warnings.warn(
"Using default sampling probabilities up to 3 hops from source node with values {}".format(
probs
)
)
if not isclose(sum(probs), 1.0):
raise ValueError("Sampling probabilities do not sum to 1")
self.negative_edge_node_distances = []
n = len(probs)
# determine the number of edges in the graph that have edge_label type
# Multiply this number by p to determine the number of positive edge examples to sample
all_edges = self._get_edges(edge_label=edge_label)
num_edges_total = len(all_edges)
print("Network has {} edges of type {}".format(num_edges_total, edge_label))
#
num_edges_to_sample = int(num_edges_total * p)
if limit_samples is not None:
if num_edges_to_sample > limit_samples:
num_edges_to_sample = limit_samples
edge_source_target_node_types = self._get_edge_source_and_target_node_types(
edges=edges_positive
)
if self.g_master is None:
edges = self.g.edges()
else:
edges = self.g_master.edges()
# to speed up lookup of edges in edges list, create a set the values stored are the concatenation of
# the source and target node ids.
edges_set = set(edges)
edges_set.update({(e[1], e[0]) for e in edges})
sampled_edges_set = set()
start_nodes = list(self.g.nodes(data=True))
nodes_dict = {node[0]: node[1]["label"] for node in start_nodes}
count = 0
sampled_edges = []
num_iter = int(np.ceil(num_edges_to_sample / (1.0 * len(start_nodes)))) + 1
for _ in np.arange(0, num_iter):
self._random.shuffle(start_nodes)
# sample the distance to the target node using probs
target_node_distances = (
self._random.choice(n, len(start_nodes), p=probs) + 1
)
for u, d in zip(start_nodes, target_node_distances):
# perform DFS search up to d distance from the start node u.
visited = {
node[0]: False for node in start_nodes
} # for marking already visited nodes
nodes_stack = list()
# start at node u
nodes_stack.append((u[0], 0)) # tuple is (node, depth)
while len(nodes_stack) > 0:
next_node = nodes_stack.pop()
v = next_node[0] # retrieve node id
dv = next_node[1] # retrieve node distance from u
if not visited[v]:
visited[v] = True
# Check if this nodes is at depth d; if it is, then this could be selected as the
# target node for a negative edge sample. Otherwise add its neighbours to the stack, only
# if the depth is less than the search depth d.
if dv == d:
u_v_edge_type = (nodes_dict[u[0]], nodes_dict[v])
# if no edge between u and next_node[0] then this is the sample, so record and stop
# searching
# Note: The if statement below is very expensive to evaluate because it need to checks
# the membership of an element in a number of lists that can grow very large for large
# graphs and number examples to sample. Later, we should have a closer look at how we can
# speed this up.
if (
(u_v_edge_type in edge_source_target_node_types)
and (u[0] != v)
and ((u[0], v) not in edges_set)
and ((u[0], v) not in sampled_edges_set)
):
sampled_edges.append(
(u[0], v, 0)
) # the last entry is the class label
sampled_edges_set.add((u[0], v))
sampled_edges_set.add((v, u[0]))
count += 1
self.negative_edge_node_distances.append(d)
break
elif dv < d:
neighbours = list(nx.neighbors(self.g, v))
self._random.shuffle(neighbours)
neighbours = [(k, dv + 1) for k in neighbours]
nodes_stack.extend(neighbours)
if count == num_edges_to_sample:
return sampled_edges
if len(sampled_edges) != num_edges_to_sample:
raise ValueError(
"Unable to sample {} negative edges. Consider using smaller value for p.".format(
num_edges_to_sample
)
)
def _sample_negative_examples_local_dfs(
self, p=0.5, probs=None, limit_samples=None
):
"""
This method produces a list of edges that don't exist in graph self.g (negative examples). The number of
negative edges produced is equal to the number of edges in the graph times p (that should be in the range (0,1]
or limited to maximum limit_samples if the latter is not None.
This method uses depth-first search to efficiently (memory-wise) sample negative edges based on the local
neighbourhood of randomly (uniformly) sampled source nodes at distances defined by the probabilities in probs.
The source graph is not modified.
Args:
p (float): Factor that multiplies the number of edges in the graph and determines the number of no-edges to
be sampled.
probs (list): Probability distribution for the distance between source and target nodes.
limit_samples (int, optional): It limits the maximum number of samples to the given number, if not None
Returns:
(list) A list of 2-tuples that are pairs of node IDs that don't have an edge between them in the graph.
"""
if probs is None:
probs = [0.0, 0.25, 0.50, 0.25]
warnings.warn(
"Using default sampling probabilities up to 3 hops from source node with values {}".format(
probs
),
RuntimeWarning,
)
if not isclose(sum(probs), 1.0):
raise ValueError("Sampling probabilities do not sum to 1")
self.negative_edge_node_distances = []
n = len(probs)
num_edges_to_sample = int(self.g.number_of_edges() * p)
if limit_samples is not None:
if num_edges_to_sample > limit_samples:
num_edges_to_sample = limit_samples
if self.g_master is None:
edges = self.g.edges()
else:
edges = self.g_master.edges()
# to speed up lookup of edges in edges list, create a set the values stored are the concatenation of
# the source and target node ids.
edges_set = set(edges)
edges_set.update({(e[1], e[0]) for e in edges})
sampled_edges_set = set()
start_nodes = list(self.g.nodes(data=False))
count = 0
sampled_edges = []
num_iter = int(np.ceil(num_edges_to_sample / (1.0 * len(start_nodes))))
for _ in np.arange(0, num_iter):
self._random.shuffle(start_nodes)
# sample the distance to the target node using probs
target_node_distances = (
self._random.choice(n, len(start_nodes), p=probs) + 1
)
for u, d in zip(start_nodes, target_node_distances):
# perform DFS search up to d distance from the start node u.
visited = {node: False for node in start_nodes}
nodes_stack = list()
# start at node u
nodes_stack.append((u, 0)) # tuple is node, depth
while len(nodes_stack) > 0:
next_node = nodes_stack.pop()
v = next_node[0]
dv = next_node[1]
if not visited[v]:
visited[v] = True
# Check if this nodes is at depth d; if it is, then this could be selected as the
# target node for a negative edge sample. Otherwise add its neighbours to the stack, only
# if the depth is less than the search depth d.
if dv == d:
# if no edge between u and next_node[0] then this is the sample, so record and stop
# searching
if (
(u != v)
and ((u, v) not in edges_set)
and ((u, v) not in sampled_edges_set)
):
sampled_edges.append(
(u, v, 0)
) # the last entry is the class label
sampled_edges_set.add((u, v))
sampled_edges_set.add((v, u))
count += 1
self.negative_edge_node_distances.append(d)
break
elif dv < d:
neighbours = list(nx.neighbors(self.g, v))
self._random.shuffle(neighbours)
neighbours = [(k, dv + 1) for k in neighbours]
nodes_stack.extend(neighbours)
if count == num_edges_to_sample:
return sampled_edges
if len(sampled_edges) != num_edges_to_sample:
raise ValueError(
"Unable to sample {} negative edges. Consider using smaller value for p.".format(
num_edges_to_sample
)
)
def _sample_negative_examples_global(self, p=0.5, limit_samples=None):
"""
This method samples uniformly at random nodes from the graph and, if they don't have an edge in the graph,
it records the pair as a negative edge.
Args:
p: (float) factor that multiplies the number of edges in the graph and determines the number of negative
edges to be sampled.
limit_samples: (int, optional) it limits the maximum number of samples to the given number, if not None
Returns:
(list) A list of 2-tuples that are pairs of node IDs that don't have an edge between them in the graph.
"""
self.negative_edge_node_distances = []
num_edges_to_sample = int(self.g.number_of_edges() * p)
if limit_samples is not None:
if num_edges_to_sample > limit_samples:
num_edges_to_sample = limit_samples
if self.g_master is None:
edges = list(self.g.edges())
else:
edges = list(self.g_master.edges())
# to speed up lookup of edges in edges list, create a set the values stored are the concatenation of
# the source and target node ids.
edges_set = set(edges)
edges_set.update({(u[1], u[0]) for u in edges})
sampled_edges_set = set()
start_nodes = list(self.g.nodes(data=False))
end_nodes = list(self.g.nodes(data=False))
count = 0
sampled_edges = []
num_iter = int(np.ceil(num_edges_to_sample / (1.0 * len(start_nodes)))) + 1
for _ in np.arange(0, num_iter):
self._random.shuffle(start_nodes)
self._random.shuffle(end_nodes)
for u, v in zip(start_nodes, end_nodes):
if (
(u != v)
and ((u, v) not in edges_set)
and ((u, v) not in sampled_edges_set)
):
sampled_edges.append((u, v, 0)) # the last entry is the class label
sampled_edges_set.update(
{(u, v), (v, u)}
) # test for bi-directional edges
count += 1
if count == num_edges_to_sample:
return sampled_edges
if len(sampled_edges) != num_edges_to_sample:
raise ValueError(
"Unable to sample {} negative edges. Consider using smaller value for p.".format(
num_edges_to_sample
)
)
def _sample_negative_examples_by_edge_type_global(
self, edges, edge_label, p=0.5, limit_samples=None
):
"""
This method produces a list of edges that don't exist in graph self.g (negative examples). The number of
negative edges produced is equal to the number of edges with label edge_label in the graph times p (that should
be in the range (0,1] or limited to maximum limit_samples if the latter is not None. The negative samples are
between node types as inferred from the edge type of the positive examples previously removed from the graph
and given in edges_positive.
The source graph is not modified.
Args:
edges (list): The positive edge examples that have previously been removed from the graph
edge_label (str): The edge type to sample negative examples of
p (float): Factor that multiplies the number of edges in the graph and determines the number of negative
edges to be sampled.
limit_samples (int, optional): It limits the maximum number of samples to the given number, if not None
Returns:
(list) A list of 2-tuples that are pairs of node IDs that don't have an edge between them in the graph.
"""
self.negative_edge_node_distances = []
# determine the number of edges in the graph that have edge_label type
# Multiply this number by p to determine the number of positive edge examples to sample
all_edges = self._get_edges(edge_label=edge_label)
num_edges_total = len(all_edges)
print("Network has {} edges of type {}".format(num_edges_total, edge_label))
#
num_edges_to_sample = int(num_edges_total * p)
if limit_samples is not None:
if num_edges_to_sample > limit_samples:
num_edges_to_sample = limit_samples
edge_source_target_node_types = self._get_edge_source_and_target_node_types(
edges=edges
)
# to speed up lookup of edges in edges list, create a set the values stored are the concatenation of
# the source and target node ids.
edges_set = set(edges)
edges_set.update({(u[1], u[0]) for u in edges})
sampled_edges_set = set()
start_nodes = list(self.g.nodes(data=True))
end_nodes = list(self.g.nodes(data=True))
count = 0
sampled_edges = []
num_iter = int(np.ceil(num_edges_to_sample / (1.0 * len(start_nodes)))) + 1
for _ in np.arange(0, num_iter):
self._random.shuffle(start_nodes)
self._random.shuffle(end_nodes)
for u, v in zip(start_nodes, end_nodes):
u_v_edge_type = (u[1]["label"], v[1]["label"])
if (
(u_v_edge_type in edge_source_target_node_types)
and (u != v)
and ((u[0], v[0]) not in edges_set)
and ((u[0], v[0]) not in sampled_edges_set)
):
sampled_edges.append(
(u[0], v[0], 0)
) # the last entry is the class label
sampled_edges_set.update({(u[0], v[0]), (v[0], u[0])})
count += 1
if count == num_edges_to_sample:
return sampled_edges
if len(sampled_edges) != num_edges_to_sample:
raise ValueError(
"Unable to sample {} negative edges. Consider using smaller value for p.".format(
num_edges_to_sample
)
)
def _get_minimum_spanning_edges(self):
"""
Given an undirected graph, it calculates the minimum set of edges such that graph connectivity is preserved.
Returns:
(list) The minimum spanning edges of the undirected graph self.g
"""
mst = nx.minimum_spanning_edges(self.g, data=False)
edges = list(mst)
# to speed up lookup of edges in edges list, create a set the values stored are the concatenation of
# the source and target node ids.
self.minedges_set = {(u[0], u[1]) for u in edges}
self.minedges_set.update({(u[1], u[0]) for u in edges})
return edges
```