# -*- coding: utf-8 -*-
#
# Copyright 2020 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import activations, initializers, constraints, regularizers
from tensorflow.keras.layers import Input, Layer, Lambda, Dropout, Reshape, Embedding
from .misc import deprecated_model_function
from ..mapper.knowledge_graph import KGTripleGenerator, KGTripleSequence
from ..core.experimental import experimental
from ..core.validation import require_integer_in_range
[docs]class ComplExScore(Layer):
"""
ComplEx scoring Keras layer.
Original Paper: Complex Embeddings for Simple Link Prediction, Théo Trouillon, Johannes Welbl,
Sebastian Riedel, Éric Gaussier and Guillaume Bouchard, ICML
2016. http://jmlr.org/proceedings/papers/v48/trouillon16.pdf
This combines subject, relation and object embeddings into a score of the likelihood of the
link.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
[docs] def build(self, input_shape):
self.built = True
[docs] def call(self, inputs):
"""
Applies the layer.
Args:
inputs: a list of 6 tensors (``shape = batch size × 1 × embedding dimension k``), where
the three consecutive pairs represent real and imaginary parts of the subject,
relation and object embeddings, respectively, that is, ``inputs == [Re(subject),
Im(subject), Re(relation), ...]``
"""
s_re, s_im, r_re, r_im, o_re, o_im = inputs
def inner(r, s, o):
return tf.reduce_sum(r * s * o, axis=2)
# expansion of Re(<w_r, e_s, conjugate(e_o)>)
score = (
inner(r_re, s_re, o_re)
+ inner(r_re, s_im, o_im)
+ inner(r_im, s_re, o_im)
- inner(r_im, s_im, o_re)
)
return score
[docs]class ComplEx:
"""
Embedding layers and a ComplEx scoring layers that implement the ComplEx knowledge graph
embedding algorithm as in http://jmlr.org/proceedings/papers/v48/trouillon16.pdf
Args:
generator (KGTripleGenerator): A generator of triples to feed into the model.
embedding_dimension (int): the dimension of the embedding (that is, a vector in
``C^embedding_dimension`` is learnt for each node and each link type)
embeddings_initializer (str or func, optional): The initialiser to use for the embeddings
(the default of random normal values matches the paper's reference implementation).
embeddings_regularizer (str or func, optional): The regularizer to use for the embeddings.
"""
def __init__(
self,
generator,
embedding_dimension,
embeddings_initializer="normal",
embeddings_regularizer=None,
):
if not isinstance(generator, KGTripleGenerator):
raise TypeError(
f"generator: expected KGTripleGenerator, found {type(generator).__name__}"
)
graph = generator.G
self.num_nodes = graph.number_of_nodes()
self.num_edge_types = len(graph._edges.types)
self.embedding_dimension = embedding_dimension
def embed(count):
return Embedding(
count,
embedding_dimension,
embeddings_initializer=embeddings_initializer,
embeddings_regularizer=embeddings_regularizer,
)
# ComplEx generates embeddings in C, which we model as separate real and imaginary
# embeddings
self._node_embeddings_real = embed(self.num_nodes)
self._node_embeddings_imag = embed(self.num_nodes)
self._edge_type_embeddings_real = embed(self.num_edge_types)
self._edge_type_embeddings_imag = embed(self.num_edge_types)
[docs] def embeddings(self):
"""
Retrieve the embeddings for nodes/entities and edge types/relations in this ComplEx model.
Returns:
A tuple of numpy complex arrays: the first element is the embeddings for nodes/entities
(``shape = number of nodes × k``), the second element is the embeddings for edge
types/relations (``shape = number of edge types x k``).
"""
node = 1j * self._node_embeddings_imag.embeddings.numpy()
node += self._node_embeddings_real.embeddings.numpy()
rel = 1j * self._edge_type_embeddings_imag.embeddings.numpy()
rel += self._edge_type_embeddings_real.embeddings.numpy()
return node, rel
[docs] def rank_edges_against_all_nodes(self, test_data, known_edges_graph):
"""
Returns the ranks of the true edges in ``test_data``, when scored against all other similar
edges.
For each input edge ``E = (s, r, o)``, the score of the *modified-object* edge ``(s, r, n)``
is computed for every node ``n`` in the graph, and similarly the score of the
*modified-subject* edge ``(n, r, o)``.
This computes "raw" and "filtered" ranks:
raw
The score of each edge is ranked against all of the modified-object and modified-subject
ones, for instance, if ``E = ("a", "X", "b")`` has score 3.14, and only one
modified-object edge has a higher score (e.g. ``F = ("a", "X", "c")``), then the raw
modified-object rank for ``E`` will be 2; if all of the ``(n, "X", "b")`` edges have score
less than 3.14, then the raw modified-subject rank for ``E`` will be 1.
filtered
The score of each edge is ranked against only the unknown modified-object and
modified-subject edges. An edge is considered known if it is in ``known_edges_graph``
which should typically hold every edge in the dataset (that is everything from the train,
test and validation sets, if the data has been split). For instance, continuing the raw
example, if the higher-scoring edge ``F`` is in the graph, then it will be ignored, giving
a filtered modified-object rank for ``E`` of 1. (If ``F`` was not in the graph, the
filtered modified-object rank would be 2.)
Args:
test_data: the output of :meth:`KGTripleGenerator.flow` on some test triples
known_edges_graph (StellarGraph):
a graph instance containing all known edges/triples
Returns:
A numpy array of integer raw ranks. It has shape ``N × 2``, where N is the number of
test triples in ``test_data``; the first column (``array[:, 0]``) holds the
modified-object ranks, and the second (``array[:, 1]``) holds the modified-subject
ranks.
"""
if not isinstance(test_data, KGTripleSequence):
raise TypeError(
"test_data: expected KGTripleSequence; found {type(test_data).__name__}"
)
num_nodes = known_edges_graph.number_of_nodes()
all_node_embs, all_rel_embs = self.embeddings()
all_node_embs_conj = all_node_embs.conj()
raws = []
filtereds = []
# run through the batches and compute the ranks for each one
num_tested = 0
for ((subjects, rels, objects),) in test_data:
num_tested += len(subjects)
# batch_size x k
ss = all_node_embs[subjects, :]
rs = all_rel_embs[rels, :]
os = all_node_embs[objects, :]
# reproduce the scoring function for ranking the given subject and relation against all
# other nodes (objects), and similarly given relation and object against all
# subjects. The bulk operations give speeeeeeeeed.
# (num_nodes x k, batch_size x k) -> num_nodes x batch_size
mod_o_pred = np.inner(all_node_embs_conj, ss * rs).real
mod_s_pred = np.inner(all_node_embs, rs * os.conj()).real
mod_o_raw, mod_o_filt = _ranks_from_score_columns(
mod_o_pred,
true_modified_node_ilocs=objects,
unmodified_node_ilocs=subjects,
true_rel_ilocs=rels,
modified_object=True,
known_edges_graph=known_edges_graph,
)
mod_s_raw, mod_s_filt = _ranks_from_score_columns(
mod_s_pred,
true_modified_node_ilocs=subjects,
true_rel_ilocs=rels,
modified_object=False,
unmodified_node_ilocs=objects,
known_edges_graph=known_edges_graph,
)
raws.append(np.column_stack((mod_o_raw, mod_s_raw)))
filtereds.append(np.column_stack((mod_o_filt, mod_s_filt)))
# make one big array
raw = np.concatenate(raws)
filtered = np.concatenate(filtereds)
# for each edge, there should be an pair of raw ranks
assert raw.shape == filtered.shape == (num_tested, 2)
return raw, filtered
def __call__(self, x):
"""
Apply embedding layers to the source, relation and object input "ilocs" (sequential integer
labels for the nodes and edge types).
Args:
x (list): list of 3 tensors (each batch size x 1) storing the ilocs of the subject,
relation and object elements for each edge in the batch.
"""
s_iloc, r_iloc, o_iloc = x
s_re = self._node_embeddings_real(s_iloc)
s_im = self._node_embeddings_imag(s_iloc)
r_re = self._edge_type_embeddings_real(r_iloc)
r_im = self._edge_type_embeddings_imag(r_iloc)
o_re = self._node_embeddings_real(o_iloc)
o_im = self._node_embeddings_imag(o_iloc)
scoring = ComplExScore()
return scoring([s_re, s_im, r_re, r_im, o_re, o_im])
[docs] def in_out_tensors(self):
"""
Builds a ComplEx model.
Returns:
A tuple of (list of input tensors, tensor for ComplEx model score outputs)
"""
s_iloc = Input(shape=1)
r_iloc = Input(shape=1)
o_iloc = Input(shape=1)
x_inp = [s_iloc, r_iloc, o_iloc]
x_out = self(x_inp)
return x_inp, x_out
build = deprecated_model_function(in_out_tensors, "build")
[docs]class DistMultScore(Layer):
"""
DistMult scoring Keras layer.
Original Paper: Embedding Entities and Relations for Learning and Inference in Knowledge
Bases. Bishan Yang, Wen-tau Yih, Xiaodong He, Jianfeng Gao, Li Deng. ICLR 2015
This combines subject, relation and object embeddings into a score of the likelihood of the
link.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
[docs] def build(self, input_shape):
self.built = True
[docs] def call(self, inputs):
"""
Applies the layer.
Args:
inputs: a list of 3 tensors (``shape = batch size × 1 × embedding dimension``),
representing the subject, relation and object embeddings, respectively, that is,
``inputs == [subject, relation, object]``
"""
y_e1, m_r, y_e2 = inputs
# y_e1^T M_r y_e2, where M_r = diag(m_r) is a diagonal matrix
score = tf.reduce_sum(y_e1 * m_r * y_e2, axis=2)
return score
[docs]class DistMult:
"""
Embedding layers and a DistMult scoring layers that implement the DistMult knowledge graph
embedding algorithm as in https://arxiv.org/pdf/1412.6575.pdf
Args:
generator (KGTripleGenerator): A generator of triples to feed into the model.
embedding_dimension (int): the dimension of the embedding (that is, a vector in
``R^embedding_dimension`` is learnt for each node and each link type)
embeddings_initializer (str or func, optional): The initialiser to use for the embeddings.
embeddings_regularizer (str or func, optional): The regularizer to use for the embeddings.
"""
def __init__(
self,
generator,
embedding_dimension,
embeddings_initializer="uniform",
embeddings_regularizer=None,
):
if not isinstance(generator, KGTripleGenerator):
raise TypeError(
f"generator: expected KGTripleGenerator, found {type(generator).__name__}"
)
require_integer_in_range(embedding_dimension, "embedding_dimension", min_val=1)
graph = generator.G
self.num_nodes = graph.number_of_nodes()
self.num_edge_types = len(graph._edges.types)
self.embedding_dimension = embedding_dimension
def embed(count):
# FIXME(#980,https://github.com/tensorflow/tensorflow/issues/33755): embeddings can't use
# constraints to be normalized: per section 4 in the paper, the embeddings should be
# normalised to have unit norm.
return Embedding(
count,
embedding_dimension,
embeddings_initializer=embeddings_initializer,
embeddings_regularizer=embeddings_regularizer,
)
# DistMult generates embeddings in R
self._node_embeddings = embed(self.num_nodes)
self._edge_type_embeddings = embed(self.num_edge_types)
[docs] def embeddings(self):
"""
Retrieve the embeddings for nodes/entities and edge types/relations in this DistMult model.
Returns:
A tuple of numpy arrays: the first element is the embeddings for nodes/entities
(``shape = number of nodes × k``), the second element is the embeddings for edge
types/relations (``shape = number of edge types x k``).
"""
return (
self._node_embeddings.embeddings.numpy(),
self._edge_type_embeddings.embeddings.numpy(),
)
[docs] def rank_edges_against_all_nodes(self, test_data, known_edges_graph):
"""
Returns the ranks of the true edges in ``test_data``, when scored against all other similar
edges.
For each input edge ``E = (s, r, o)``, the score of the *modified-object* edge ``(s, r, n)``
is computed for every node ``n`` in the graph, and similarly the score of the
*modified-subject* edge ``(n, r, o)``.
This computes "raw" and "filtered" ranks:
raw
The score of each edge is ranked against all of the modified-object and modified-subject
ones, for instance, if ``E = ("a", "X", "b")`` has score 3.14, and only one
modified-object edge has a higher score (e.g. ``F = ("a", "X", "c")``), then the raw
modified-object rank for ``E`` will be 2; if all of the ``(n, "X", "b")`` edges have score
less than 3.14, then the raw modified-subject rank for ``E`` will be 1.
filtered
The score of each edge is ranked against only the unknown modified-object and
modified-subject edges. An edge is considered known if it is in ``known_edges_graph``
which should typically hold every edge in the dataset (that is everything from the train,
test and validation sets, if the data has been split). For instance, continuing the raw
example, if the higher-scoring edge ``F`` is in the graph, then it will be ignored, giving
a filtered modified-object rank for ``E`` of 1. (If ``F`` was not in the graph, the
filtered modified-object rank would be 2.)
Args:
test_data: the output of :meth:`KGTripleGenerator.flow` on some test triples
known_edges_graph (StellarGraph):
a graph instance containing all known edges/triples
Returns:
A numpy array of integer raw ranks. It has shape ``N × 2``, where N is the number of
test triples in ``test_data``; the first column (``array[:, 0]``) holds the
modified-object ranks, and the second (``array[:, 1]``) holds the modified-subject
ranks.
"""
if not isinstance(test_data, KGTripleSequence):
raise TypeError(
"test_data: expected KGTripleSequence; found {type(test_data).__name__}"
)
num_nodes = known_edges_graph.number_of_nodes()
all_node_embs, all_rel_embs = self.embeddings()
raws = []
filtereds = []
# run through the batches and compute the ranks for each one
num_tested = 0
for ((subjects, rels, objects),) in test_data:
num_tested += len(subjects)
# batch_size x k
ss = all_node_embs[subjects, :]
rs = all_rel_embs[rels, :]
os = all_node_embs[objects, :]
# reproduce the scoring function for ranking the given subject and relation against all
# other nodes (objects), and similarly given relation and object against all
# subjects. The bulk operations give speeeeeeeeed.
# (num_nodes x k, batch_size x k) -> num_nodes x batch_size
mod_o_pred = np.inner(all_node_embs, ss * rs)
mod_s_pred = np.inner(all_node_embs, rs * os)
mod_o_raw, mod_o_filt = _ranks_from_score_columns(
mod_o_pred,
true_modified_node_ilocs=objects,
unmodified_node_ilocs=subjects,
true_rel_ilocs=rels,
modified_object=True,
known_edges_graph=known_edges_graph,
)
mod_s_raw, mod_s_filt = _ranks_from_score_columns(
mod_s_pred,
true_modified_node_ilocs=subjects,
true_rel_ilocs=rels,
modified_object=False,
unmodified_node_ilocs=objects,
known_edges_graph=known_edges_graph,
)
raws.append(np.column_stack((mod_o_raw, mod_s_raw)))
filtereds.append(np.column_stack((mod_o_filt, mod_s_filt)))
# make one big array
raw = np.concatenate(raws)
filtered = np.concatenate(filtereds)
# for each edge, there should be an pair of raw ranks
assert raw.shape == filtered.shape == (num_tested, 2)
return raw, filtered
def __call__(self, x):
"""
Apply embedding layers to the source, relation and object input "ilocs" (sequential integer
labels for the nodes and edge types).
Args:
x (list): list of 3 tensors (``shape = batch size × 1``) storing the ilocs of the
subject, relation and object elements for each edge in the batch.
"""
e1_iloc, r_iloc, e2_iloc = x
y_e1 = self._node_embeddings(e1_iloc)
m_r = self._edge_type_embeddings(r_iloc)
y_e2 = self._node_embeddings(e2_iloc)
scoring = DistMultScore()
return scoring([y_e1, m_r, y_e2])
[docs] def in_out_tensors(self):
"""
Builds a DistMult model.
Returns:
A tuple of (list of input tensors, tensor for DistMult model score outputs)
"""
e1_iloc = Input(shape=(None,))
r_iloc = Input(shape=(None,))
e2_iloc = Input(shape=(None,))
x_inp = [e1_iloc, r_iloc, e2_iloc]
x_out = self(x_inp)
return x_inp, x_out
build = deprecated_model_function(in_out_tensors, "build")
def _ranks_from_score_columns(
pred,
*,
true_modified_node_ilocs,
unmodified_node_ilocs,
true_rel_ilocs,
modified_object,
known_edges_graph,
):
"""
Compute the raw and filtered ranks of a set of true edges ``E = (s, r, o)`` against all
mutations of one end of them, e.g. ``E' = (s, r, n)`` for "modified-object".
The raw rank is the total number of edges scored higher than the true edge ``E``, and the
filtered rank is the total number of unknown edges (not in ``known_edges_graph``).
Args:
pred: a 2D array: each column represents the scores for a single true edge and its
mutations, where the row indicates the ``n`` in ``E'`` (e.g. row 0 corresponds to ``n``
= node with iloc 0)
true_modified_node_ilocs: an array of ilocs of the actual node that was modified, that is,
``o`` for modified-object and ``s`` for modified subject``, index ``i`` corresponds to
the iloc for column ``pred[:, i]``.
unmodified_node_ilocs: similar to ``true_modified_node_ilocs``, except for the other end of
the edge: the node that was not modified.
true_rel_ilocs: similar to ``true_modified_node_ilocs``, except for the relationship type of
the edge (``r``).
modified_object (bool): whether the object was modified (``True``), or the subject
(``False``)
known_edges_graph (StellarGraph): a graph containing all the known edges that should be
ignored when computing filtered ranks
Returns:
a tuple of raw ranks and filtered ranks, each is an array of integers >= 1 where index ``i``
corresponds to the rank of the true edge among all of the scores in column ``pred[:, i]``.
"""
batch_size = len(true_modified_node_ilocs)
assert pred.shape == (known_edges_graph.number_of_nodes(), batch_size)
assert unmodified_node_ilocs.shape == true_rel_ilocs.shape == (batch_size,)
# the score of the true edge, for each edge in the batch (this indexes in lock-step,
# i.e. [pred[true_modified_node_ilocs[0], range(batch_size)[0]], ...])
true_scores = pred[true_modified_node_ilocs, range(batch_size)]
# for each column, compare all the scores against the score of the true edge
greater = pred > true_scores
# the raw rank is the number of elements scored higher than the true edge
raw_rank = 1 + greater.sum(axis=0)
# the filtered rank is the number of unknown elements scored higher, where an element is
# known if the edge (s, r, n) (for modified-object) or (n, r, o) (for modified-subject)
# exists in known_edges_graph.
# FIXME(#870): this would be better without external IDs <-> ilocs translation
unmodified_nodes = known_edges_graph._nodes.ids.from_iloc(unmodified_node_ilocs)
true_rels = known_edges_graph._edges.types.from_iloc(true_rel_ilocs)
if modified_object:
neigh_func = known_edges_graph.out_nodes
else:
neigh_func = known_edges_graph.in_nodes
# collect all the neighbours into a single array to do one _get_index_for_nodes call,
# which has relatively high constant cost
neighbours = []
columns = []
for batch_column, (unmodified, r) in enumerate(zip(unmodified_nodes, true_rels)):
this_neighs = neigh_func(unmodified, edge_types=[r])
neighbours.extend(this_neighs)
columns.extend(batch_column for _ in this_neighs)
neighbour_ilocs = known_edges_graph._get_index_for_nodes(neighbours)
greater[neighbour_ilocs, columns] = False
filtered_rank = 1 + greater.sum(axis=0)
assert raw_rank.shape == filtered_rank.shape == (batch_size,)
return raw_rank, filtered_rank