# Source code for stellargraph.layer.knowledge_graph

# -*- coding: utf-8 -*-
#
# Copyright 2020 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import activations, initializers, constraints, regularizers
from tensorflow.keras.layers import Input, Layer, Lambda, Dropout, Reshape, Embedding

from .misc import deprecated_model_function
from ..mapper.knowledge_graph import KGTripleGenerator, KGTripleSequence
from ..core.experimental import experimental
from ..core.validation import require_integer_in_range


class ComplExScore(Layer):
    """
    ComplEx scoring Keras layer.

    Original Paper: Complex Embeddings for Simple Link Prediction, Théo Trouillon, Johannes Welbl,
    Sebastian Riedel, Éric Gaussier and Guillaume Bouchard, ICML 2016.
    http://jmlr.org/proceedings/papers/v48/trouillon16.pdf

    Combines the subject, relation and object embeddings into a single score of the likelihood
    of the link.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def build(self, input_shape):
        # the layer has no weights of its own: the embeddings arrive as inputs
        self.built = True

    def call(self, inputs):
        """
        Applies the layer.

        Args:
            inputs: a list of 6 tensors (``shape = batch size × 1 × embedding dimension k``),
                where the three consecutive pairs represent real and imaginary parts of the
                subject, relation and object embeddings, respectively, that is, ``inputs ==
                [Re(subject), Im(subject), Re(relation), ...]``
        """
        subj_re, subj_im, rel_re, rel_im, obj_re, obj_im = inputs

        def product_sum(a, b, c):
            return tf.reduce_sum(a * b * c, axis=2)

        # expansion of Re(<w_r, e_s, conjugate(e_o)>) into four real-valued terms
        return (
            product_sum(rel_re, subj_re, obj_re)
            + product_sum(rel_re, subj_im, obj_im)
            + product_sum(rel_im, subj_re, obj_im)
            - product_sum(rel_im, subj_im, obj_re)
        )
class ComplEx:
    """
    Embedding layers and a ComplEx scoring layers that implement the ComplEx knowledge graph
    embedding algorithm as in http://jmlr.org/proceedings/papers/v48/trouillon16.pdf

    Args:
        generator (KGTripleGenerator): A generator of triples to feed into the model.

        embedding_dimension (int): the dimension of the embedding (that is, a vector in
            ``C^embedding_dimension`` is learnt for each node and each link type)

        embeddings_initializer (str or func, optional): The initialiser to use for the embeddings
            (the default of random normal values matches the paper's reference implementation).

        embeddings_regularizer (str or func, optional): The regularizer to use for the embeddings.
    """

    def __init__(
        self,
        generator,
        embedding_dimension,
        embeddings_initializer="normal",
        embeddings_regularizer=None,
    ):
        if not isinstance(generator, KGTripleGenerator):
            raise TypeError(
                f"generator: expected KGTripleGenerator, found {type(generator).__name__}"
            )

        # validate up-front, consistently with DistMult
        require_integer_in_range(embedding_dimension, "embedding_dimension", min_val=1)

        graph = generator.G
        self.num_nodes = graph.number_of_nodes()
        self.num_edge_types = len(graph._edges.types)
        self.embedding_dimension = embedding_dimension

        def embed(count):
            return Embedding(
                count,
                embedding_dimension,
                embeddings_initializer=embeddings_initializer,
                embeddings_regularizer=embeddings_regularizer,
            )

        # ComplEx generates embeddings in C, which we model as separate real and imaginary
        # embeddings
        self._node_embeddings_real = embed(self.num_nodes)
        self._node_embeddings_imag = embed(self.num_nodes)
        self._edge_type_embeddings_real = embed(self.num_edge_types)
        self._edge_type_embeddings_imag = embed(self.num_edge_types)

    def embeddings(self):
        """
        Retrieve the embeddings for nodes/entities and edge types/relations in this ComplEx model.

        Returns:
            A tuple of numpy complex arrays: the first element is the embeddings for
            nodes/entities (``shape = number of nodes × k``), the second element is the
            embeddings for edge types/relations (``shape = number of edge types x k``).
        """
        node = 1j * self._node_embeddings_imag.embeddings.numpy()
        node += self._node_embeddings_real.embeddings.numpy()

        rel = 1j * self._edge_type_embeddings_imag.embeddings.numpy()
        rel += self._edge_type_embeddings_real.embeddings.numpy()

        return node, rel

    def rank_edges_against_all_nodes(self, test_data, known_edges_graph):
        """
        Returns the ranks of the true edges in ``test_data``, when scored against all other
        similar edges.

        For each input edge ``E = (s, r, o)``, the score of the *modified-object* edge ``(s, r,
        n)`` is computed for every node ``n`` in the graph, and similarly the score of the
        *modified-subject* edge ``(n, r, o)``.

        This computes "raw" and "filtered" ranks:

        raw
          The score of each edge is ranked against all of the modified-object and
          modified-subject ones, for instance, if ``E = ("a", "X", "b")`` has score 3.14, and
          only one modified-object edge has a higher score (e.g. ``F = ("a", "X", "c")``), then
          the raw modified-object rank for ``E`` will be 2; if all of the ``(n, "X", "b")``
          edges have score less than 3.14, then the raw modified-subject rank for ``E`` will
          be 1.

        filtered
          The score of each edge is ranked against only the unknown modified-object and
          modified-subject edges. An edge is considered known if it is in ``known_edges_graph``
          which should typically hold every edge in the dataset (that is everything from the
          train, test and validation sets, if the data has been split). For instance, continuing
          the raw example, if the higher-scoring edge ``F`` is in the graph, then it will be
          ignored, giving a filtered modified-object rank for ``E`` of 1. (If ``F`` was not in
          the graph, the filtered modified-object rank would be 2.)

        Args:
            test_data: the output of :meth:`KGTripleGenerator.flow` on some test triples

            known_edges_graph (StellarGraph):
                a graph instance containing all known edges/triples

        Returns:
            A tuple of two numpy arrays of integer ranks, ``(raw, filtered)``. Each has shape
            ``N × 2``, where N is the number of test triples in ``test_data``; the first column
            (``array[:, 0]``) holds the modified-object ranks, and the second (``array[:, 1]``)
            holds the modified-subject ranks.
        """
        if not isinstance(test_data, KGTripleSequence):
            # bug fix: this message was previously a plain string missing the `f` prefix, so
            # the actual type name was never interpolated
            raise TypeError(
                f"test_data: expected KGTripleSequence; found {type(test_data).__name__}"
            )

        all_node_embs, all_rel_embs = self.embeddings()
        all_node_embs_conj = all_node_embs.conj()

        raws = []
        filtereds = []

        # run through the batches and compute the ranks for each one
        num_tested = 0
        for ((subjects, rels, objects),) in test_data:
            num_tested += len(subjects)

            # batch_size x k
            ss = all_node_embs[subjects, :]
            rs = all_rel_embs[rels, :]
            os = all_node_embs[objects, :]

            # reproduce the scoring function for ranking the given subject and relation against
            # all other nodes (objects), and similarly given relation and object against all
            # subjects. The bulk operations give speeeeeeeeed.
            # (num_nodes x k, batch_size x k) -> num_nodes x batch_size
            mod_o_pred = np.inner(all_node_embs_conj, ss * rs).real
            mod_s_pred = np.inner(all_node_embs, rs * os.conj()).real

            mod_o_raw, mod_o_filt = _ranks_from_score_columns(
                mod_o_pred,
                true_modified_node_ilocs=objects,
                unmodified_node_ilocs=subjects,
                true_rel_ilocs=rels,
                modified_object=True,
                known_edges_graph=known_edges_graph,
            )
            mod_s_raw, mod_s_filt = _ranks_from_score_columns(
                mod_s_pred,
                true_modified_node_ilocs=subjects,
                true_rel_ilocs=rels,
                modified_object=False,
                unmodified_node_ilocs=objects,
                known_edges_graph=known_edges_graph,
            )

            raws.append(np.column_stack((mod_o_raw, mod_s_raw)))
            filtereds.append(np.column_stack((mod_o_filt, mod_s_filt)))

        # make one big array
        raw = np.concatenate(raws)
        filtered = np.concatenate(filtereds)
        # for each edge, there should be a pair of raw ranks
        assert raw.shape == filtered.shape == (num_tested, 2)

        return raw, filtered

    def __call__(self, x):
        """
        Apply embedding layers to the source, relation and object input "ilocs" (sequential
        integer labels for the nodes and edge types).

        Args:
            x (list): list of 3 tensors (each batch size x 1) storing the ilocs of the subject,
                relation and object elements for each edge in the batch.
        """
        s_iloc, r_iloc, o_iloc = x

        s_re = self._node_embeddings_real(s_iloc)
        s_im = self._node_embeddings_imag(s_iloc)

        r_re = self._edge_type_embeddings_real(r_iloc)
        r_im = self._edge_type_embeddings_imag(r_iloc)

        o_re = self._node_embeddings_real(o_iloc)
        o_im = self._node_embeddings_imag(o_iloc)

        scoring = ComplExScore()
        return scoring([s_re, s_im, r_re, r_im, o_re, o_im])

    def in_out_tensors(self):
        """
        Builds a ComplEx model.

        Returns:
            A tuple of (list of input tensors, tensor for ComplEx model score outputs)
        """
        s_iloc = Input(shape=1)
        r_iloc = Input(shape=1)
        o_iloc = Input(shape=1)

        x_inp = [s_iloc, r_iloc, o_iloc]
        x_out = self(x_inp)

        return x_inp, x_out

    build = deprecated_model_function(in_out_tensors, "build")
class DistMultScore(Layer):
    """
    DistMult scoring Keras layer.

    Original Paper: Embedding Entities and Relations for Learning and Inference in Knowledge
    Bases. Bishan Yang, Wen-tau Yih, Xiaodong He, Jianfeng Gao, Li Deng. ICLR 2015

    Combines the subject, relation and object embeddings into a single score of the likelihood
    of the link.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def build(self, input_shape):
        # no trainable weights of its own: the embeddings are supplied as inputs
        self.built = True

    def call(self, inputs):
        """
        Applies the layer.

        Args:
            inputs: a list of 3 tensors (``shape = batch size × 1 × embedding dimension``),
                representing the subject, relation and object embeddings, respectively, that is,
                ``inputs == [subject, relation, object]``
        """
        subject_emb, rel_diag, object_emb = inputs

        # y_e1^T M_r y_e2 with M_r = diag(m_r): an elementwise product summed over the
        # embedding axis
        return tf.reduce_sum(subject_emb * rel_diag * object_emb, axis=2)
class DistMult:
    """
    Embedding layers and a DistMult scoring layers that implement the DistMult knowledge graph
    embedding algorithm as in https://arxiv.org/pdf/1412.6575.pdf

    Args:
        generator (KGTripleGenerator): A generator of triples to feed into the model.

        embedding_dimension (int): the dimension of the embedding (that is, a vector in
            ``R^embedding_dimension`` is learnt for each node and each link type)

        embeddings_initializer (str or func, optional): The initialiser to use for the
            embeddings.

        embeddings_regularizer (str or func, optional): The regularizer to use for the
            embeddings.
    """

    def __init__(
        self,
        generator,
        embedding_dimension,
        embeddings_initializer="uniform",
        embeddings_regularizer=None,
    ):
        if not isinstance(generator, KGTripleGenerator):
            raise TypeError(
                f"generator: expected KGTripleGenerator, found {type(generator).__name__}"
            )

        require_integer_in_range(embedding_dimension, "embedding_dimension", min_val=1)

        graph = generator.G
        self.num_nodes = graph.number_of_nodes()
        self.num_edge_types = len(graph._edges.types)
        self.embedding_dimension = embedding_dimension

        def embed(count):
            # FIXME(#980,https://github.com/tensorflow/tensorflow/issues/33755): embeddings
            # can't use constraints to be normalized: per section 4 in the paper, the embeddings
            # should be normalised to have unit norm.
            return Embedding(
                count,
                embedding_dimension,
                embeddings_initializer=embeddings_initializer,
                embeddings_regularizer=embeddings_regularizer,
            )

        # DistMult generates embeddings in R
        self._node_embeddings = embed(self.num_nodes)
        self._edge_type_embeddings = embed(self.num_edge_types)

    def embeddings(self):
        """
        Retrieve the embeddings for nodes/entities and edge types/relations in this DistMult
        model.

        Returns:
            A tuple of numpy arrays: the first element is the embeddings for nodes/entities
            (``shape = number of nodes × k``), the second element is the embeddings for edge
            types/relations (``shape = number of edge types x k``).
        """
        return (
            self._node_embeddings.embeddings.numpy(),
            self._edge_type_embeddings.embeddings.numpy(),
        )

    def rank_edges_against_all_nodes(self, test_data, known_edges_graph):
        """
        Returns the ranks of the true edges in ``test_data``, when scored against all other
        similar edges.

        For each input edge ``E = (s, r, o)``, the score of the *modified-object* edge ``(s, r,
        n)`` is computed for every node ``n`` in the graph, and similarly the score of the
        *modified-subject* edge ``(n, r, o)``.

        This computes "raw" and "filtered" ranks:

        raw
          The score of each edge is ranked against all of the modified-object and
          modified-subject ones, for instance, if ``E = ("a", "X", "b")`` has score 3.14, and
          only one modified-object edge has a higher score (e.g. ``F = ("a", "X", "c")``), then
          the raw modified-object rank for ``E`` will be 2; if all of the ``(n, "X", "b")``
          edges have score less than 3.14, then the raw modified-subject rank for ``E`` will
          be 1.

        filtered
          The score of each edge is ranked against only the unknown modified-object and
          modified-subject edges. An edge is considered known if it is in ``known_edges_graph``
          which should typically hold every edge in the dataset (that is everything from the
          train, test and validation sets, if the data has been split). For instance, continuing
          the raw example, if the higher-scoring edge ``F`` is in the graph, then it will be
          ignored, giving a filtered modified-object rank for ``E`` of 1. (If ``F`` was not in
          the graph, the filtered modified-object rank would be 2.)

        Args:
            test_data: the output of :meth:`KGTripleGenerator.flow` on some test triples

            known_edges_graph (StellarGraph):
                a graph instance containing all known edges/triples

        Returns:
            A tuple of two numpy arrays of integer ranks, ``(raw, filtered)``. Each has shape
            ``N × 2``, where N is the number of test triples in ``test_data``; the first column
            (``array[:, 0]``) holds the modified-object ranks, and the second (``array[:, 1]``)
            holds the modified-subject ranks.
        """
        if not isinstance(test_data, KGTripleSequence):
            # bug fix: this message was previously a plain string missing the `f` prefix, so
            # the actual type name was never interpolated
            raise TypeError(
                f"test_data: expected KGTripleSequence; found {type(test_data).__name__}"
            )

        all_node_embs, all_rel_embs = self.embeddings()

        raws = []
        filtereds = []

        # run through the batches and compute the ranks for each one
        num_tested = 0
        for ((subjects, rels, objects),) in test_data:
            num_tested += len(subjects)

            # batch_size x k
            ss = all_node_embs[subjects, :]
            rs = all_rel_embs[rels, :]
            os = all_node_embs[objects, :]

            # reproduce the scoring function for ranking the given subject and relation against
            # all other nodes (objects), and similarly given relation and object against all
            # subjects. The bulk operations give speeeeeeeeed.
            # (num_nodes x k, batch_size x k) -> num_nodes x batch_size
            mod_o_pred = np.inner(all_node_embs, ss * rs)
            mod_s_pred = np.inner(all_node_embs, rs * os)

            mod_o_raw, mod_o_filt = _ranks_from_score_columns(
                mod_o_pred,
                true_modified_node_ilocs=objects,
                unmodified_node_ilocs=subjects,
                true_rel_ilocs=rels,
                modified_object=True,
                known_edges_graph=known_edges_graph,
            )
            mod_s_raw, mod_s_filt = _ranks_from_score_columns(
                mod_s_pred,
                true_modified_node_ilocs=subjects,
                true_rel_ilocs=rels,
                modified_object=False,
                unmodified_node_ilocs=objects,
                known_edges_graph=known_edges_graph,
            )

            raws.append(np.column_stack((mod_o_raw, mod_s_raw)))
            filtereds.append(np.column_stack((mod_o_filt, mod_s_filt)))

        # make one big array
        raw = np.concatenate(raws)
        filtered = np.concatenate(filtereds)
        # for each edge, there should be a pair of raw ranks
        assert raw.shape == filtered.shape == (num_tested, 2)

        return raw, filtered

    def __call__(self, x):
        """
        Apply embedding layers to the source, relation and object input "ilocs" (sequential
        integer labels for the nodes and edge types).

        Args:
            x (list): list of 3 tensors (``shape = batch size × 1``) storing the ilocs of the
                subject, relation and object elements for each edge in the batch.
        """
        e1_iloc, r_iloc, e2_iloc = x

        y_e1 = self._node_embeddings(e1_iloc)
        m_r = self._edge_type_embeddings(r_iloc)
        y_e2 = self._node_embeddings(e2_iloc)

        scoring = DistMultScore()
        return scoring([y_e1, m_r, y_e2])

    def in_out_tensors(self):
        """
        Builds a DistMult model.

        Returns:
            A tuple of (list of input tensors, tensor for DistMult model score outputs)
        """
        e1_iloc = Input(shape=(None,))
        r_iloc = Input(shape=(None,))
        e2_iloc = Input(shape=(None,))

        x_inp = [e1_iloc, r_iloc, e2_iloc]
        x_out = self(x_inp)

        return x_inp, x_out

    build = deprecated_model_function(in_out_tensors, "build")
def _ranks_from_score_columns( pred, *, true_modified_node_ilocs, unmodified_node_ilocs, true_rel_ilocs, modified_object, known_edges_graph, ): """ Compute the raw and filtered ranks of a set of true edges ``E = (s, r, o)`` against all mutations of one end of them, e.g. ``E' = (s, r, n)`` for "modified-object". The raw rank is the total number of edges scored higher than the true edge ``E``, and the filtered rank is the total number of unknown edges (not in ``known_edges_graph``). Args: pred: a 2D array: each column represents the scores for a single true edge and its mutations, where the row indicates the ``n`` in ``E'`` (e.g. row 0 corresponds to ``n`` = node with iloc 0) true_modified_node_ilocs: an array of ilocs of the actual node that was modified, that is, ``o`` for modified-object and ``s`` for modified subject``, index ``i`` corresponds to the iloc for column ``pred[:, i]``. unmodified_node_ilocs: similar to ``true_modified_node_ilocs``, except for the other end of the edge: the node that was not modified. true_rel_ilocs: similar to ``true_modified_node_ilocs``, except for the relationship type of the edge (``r``). modified_object (bool): whether the object was modified (``True``), or the subject (``False``) known_edges_graph (StellarGraph): a graph containing all the known edges that should be ignored when computing filtered ranks Returns: a tuple of raw ranks and filtered ranks, each is an array of integers >= 1 where index ``i`` corresponds to the rank of the true edge among all of the scores in column ``pred[:, i]``. """ batch_size = len(true_modified_node_ilocs) assert pred.shape == (known_edges_graph.number_of_nodes(), batch_size) assert unmodified_node_ilocs.shape == true_rel_ilocs.shape == (batch_size,) # the score of the true edge, for each edge in the batch (this indexes in lock-step, # i.e. 
[pred[true_modified_node_ilocs[0], range(batch_size)[0]], ...]) true_scores = pred[true_modified_node_ilocs, range(batch_size)] # for each column, compare all the scores against the score of the true edge greater = pred > true_scores # the raw rank is the number of elements scored higher than the true edge raw_rank = 1 + greater.sum(axis=0) # the filtered rank is the number of unknown elements scored higher, where an element is # known if the edge (s, r, n) (for modified-object) or (n, r, o) (for modified-subject) # exists in known_edges_graph. # FIXME(#870): this would be better without external IDs <-> ilocs translation unmodified_nodes = known_edges_graph._nodes.ids.from_iloc(unmodified_node_ilocs) true_rels = known_edges_graph._edges.types.from_iloc(true_rel_ilocs) if modified_object: neigh_func = known_edges_graph.out_nodes else: neigh_func = known_edges_graph.in_nodes # collect all the neighbours into a single array to do one _get_index_for_nodes call, # which has relatively high constant cost neighbours = [] columns = [] for batch_column, (unmodified, r) in enumerate(zip(unmodified_nodes, true_rels)): this_neighs = neigh_func(unmodified, edge_types=[r]) neighbours.extend(this_neighs) columns.extend(batch_column for _ in this_neighs) neighbour_ilocs = known_edges_graph._get_index_for_nodes(neighbours) greater[neighbour_ilocs, columns] = False filtered_rank = 1 + greater.sum(axis=0) assert raw_rank.shape == filtered_rank.shape == (batch_size,) return raw_rank, filtered_rank