Source code for stellargraph.layer.knowledge_graph

# -*- coding: utf-8 -*-
#
# Copyright 2020 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import abc

import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import activations, initializers, constraints, regularizers
from tensorflow.keras.layers import Input, Layer, Lambda, Dropout, Reshape, Embedding

from .misc import deprecated_model_function
from ..mapper.knowledge_graph import KGTripleGenerator, KGTripleSequence
from ..core.experimental import experimental
from ..core.validation import require_integer_in_range, comma_sep
from ..utils.hyperbolic import (
    poincare_ball_exp,
    poincare_ball_mobius_add,
    poincare_ball_distance,
)


class KGModel:
    def __init__(
        self,
        generator,
        scoring,
        embedding_dimension,
        *,
        embeddings_initializer,
        embeddings_regularizer,
    ):
        if not isinstance(generator, KGTripleGenerator):
            raise TypeError(
                f"generator: expected KGTripleGenerator, found {type(generator).__name__}"
            )

        if not isinstance(scoring, KGScore):
            raise TypeError(
                f"scoring: expected KGScore subclass, found {type(scoring).__name__}"
            )

        require_integer_in_range(embedding_dimension, "embedding_dimension", min_val=1)

        graph = generator.G
        self.num_nodes = graph.number_of_nodes()
        self.num_edge_types = len(graph._edges.types)

        self._scoring = scoring

        embeddings = scoring.embeddings(
            self.num_nodes,
            self.num_edge_types,
            embedding_dimension,
            embeddings_initializer,
            embeddings_regularizer,
        )

        self._validate_embeddings(embeddings)
        self._node_embs, self._edge_type_embs = embeddings

    def _validate_embeddings(self, embeddings):
        def error(found):
            raise ValueError(
                f"scoring: expected 'embeddings' method to return two lists of tf.keras.layers.Embedding layers, found {found}"
            )

        if len(embeddings) != 2:
            error(f"a sequence of length {len(embeddings)}")

        a, b = embeddings

        if not all(isinstance(x, list) for x in embeddings):
            error(f"a pair with types ({type(a).__name__}, {type(b).__name__})")

        if not all(isinstance(x, Embedding) for x in a + b):
            a_types = comma_sep(a, stringify=lambda x: type(x).__name__)
            b_types = comma_sep(b, stringify=lambda x: type(x).__name__)
            error(f"a pair of lists containing types ([{a_types}], [{b_types}])")

        # all good!
        return

    def embedding_arrays(self):
        """
        Retrieve each separate set of embeddings for nodes/entities and edge types/relations in this model.

        Returns:
            A tuple of lists of numpy arrays: the first element contains the embeddings for
            nodes/entities (each of ``shape = number of nodes × k``), the second element contains
            the embeddings for edge types/relations (each of ``shape = number of edge types × k``),
            where ``k`` is some notion of the embedding dimension for each layer. The type of the
            embeddings depends on the specific scoring function chosen.
        """
        node = [e.embeddings.numpy() for e in self._node_embs]
        edge_type = [e.embeddings.numpy() for e in self._edge_type_embs]
        return self._scoring.embeddings_to_numpy(node, edge_type)

    def embeddings(self):
        """
        Retrieve the embeddings for nodes/entities and edge types/relations in this model, if there's only one set of embeddings for each of nodes and edge types.

        Returns:
            A tuple of numpy arrays: the first element is the embeddings for nodes/entities
            (``shape = number of nodes × k``), the second element is the embeddings for edge
            types/relations (``shape = number of edge types × k``), where ``k`` is some notion of
            the embedding dimension. The type of the embeddings depends on the specific scoring
            function chosen.
        """
        node, edge_type = self.embedding_arrays()
        if len(node) != 1 or len(edge_type) != 1:
            raise ValueError(
                f"embeddings: expected a single embedding array for nodes and for edge types from embedding_arrays, found {len(node)} node and {len(edge_type)} edge type arrays; use embedding_arrays to retrieve the lists instead"
            )

        return node[0], edge_type[0]

    def __call__(self, x):
        """
        Apply embedding layers to the subject, relation and object input "ilocs" (sequential
        integer labels for the nodes and edge types).

        Args:
            x (list): list of 3 tensors (each batch size x 1) storing the ilocs of the subject,
                relation and object elements for each edge in the batch.
        """
        s_iloc, r_iloc, o_iloc = x

        sequenced = [
            (s_iloc, self._node_embs),
            (r_iloc, self._edge_type_embs),
            (o_iloc, self._node_embs),
        ]

        inp = [
            emb_layer(ilocs)
            for ilocs, emb_layers in sequenced
            for emb_layer in emb_layers
        ]

        return self._scoring(inp)

    def in_out_tensors(self):
        """
        Builds a knowledge graph model.

        Returns:
            A tuple of (list of input tensors, tensor for model score outputs)
        """
        s_iloc = Input(shape=1)
        r_iloc = Input(shape=1)
        o_iloc = Input(shape=1)

        x_inp = [s_iloc, r_iloc, o_iloc]
        x_out = self(x_inp)

        return x_inp, x_out
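
    # Hedged usage sketch (not part of the original module): the tensors returned by
    # `in_out_tensors` are typically wired into a Keras model for training; names like
    # `generator` here are assumptions, e.g.
    #
    #     kg_model = ComplEx(generator, embedding_dimension=200)
    #     x_inp, x_out = kg_model.in_out_tensors()
    #     keras_model = tf.keras.Model(inputs=x_inp, outputs=x_out)
    #     keras_model.compile(
    #         optimizer="adam",
    #         loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    #     )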

    def rank_edges_against_all_nodes(
        self, test_data, known_edges_graph, tie_breaking="random"
    ):
        """
        Returns the ranks of the true edges in ``test_data``, when scored against all other similar
        edges.

        For each input edge ``E = (s, r, o)``, the score of the *modified-object* edge ``(s, r, n)``
        is computed for every node ``n`` in the graph, and similarly the score of the
        *modified-subject* edge ``(n, r, o)``.

        This computes "raw" and "filtered" ranks:

        raw
          The score of each edge is ranked against all of the modified-object and modified-subject
          ones, for instance, if ``E = ("a", "X", "b")`` has score 3.14, and only one
          modified-object edge has a higher score (e.g. ``F = ("a", "X", "c")``), then the raw
          modified-object rank for ``E`` will be 2; if all of the ``(n, "X", "b")`` edges have score
          less than 3.14, then the raw modified-subject rank for ``E`` will be 1.

        filtered
          The score of each edge is ranked against only the unknown modified-object and
          modified-subject edges. An edge is considered known if it is in ``known_edges_graph``
          which should typically hold every edge in the dataset (that is everything from the train,
          test and validation sets, if the data has been split). For instance, continuing the raw
          example, if the higher-scoring edge ``F`` is in the graph, then it will be ignored, giving
          a filtered modified-object rank for ``E`` of 1. (If ``F`` was not in the graph, the
          filtered modified-object rank would be 2.)

        Args:
            test_data: the output of :meth:`KGTripleGenerator.flow` on some test triples

            known_edges_graph (StellarGraph):
                a graph instance containing all known edges/triples

            tie_breaking ('random', 'top' or 'bottom'):
                How to rank true edges that tie with modified-object or modified-subject ones, see
                `Sun et al. "A Re-evaluation of Knowledge Graph Completion Methods"
                <http://arxiv.org/abs/1911.03903>`_

        Returns:
            A tuple of two numpy arrays of integer ranks: raw and filtered, in that order. Each
            has shape ``N × 2``, where N is the number of test triples in ``test_data``; the
            first column (``array[:, 0]``) holds the modified-object ranks, and the second
            (``array[:, 1]``) holds the modified-subject ranks.
        """

        if not isinstance(test_data, KGTripleSequence):
            raise TypeError(
                "test_data: expected KGTripleSequence; found {type(test_data).__name__}"
            )

        num_nodes = known_edges_graph.number_of_nodes()

        node_embs, edge_type_embs = self.embedding_arrays()
        extra_data = self._scoring.bulk_scoring_data(node_embs, edge_type_embs)

        raws = []
        filtereds = []

        # run through the batches and compute the ranks for each one
        num_tested = 0
        for ((subjects, rels, objects),) in test_data:
            num_tested += len(subjects)

            # batch_size x k
            ss = [e[subjects, :] for e in node_embs]
            rs = [e[rels, :] for e in edge_type_embs]
            os = [e[objects, :] for e in node_embs]

            mod_o_pred, mod_s_pred = self._scoring.bulk_scoring(
                node_embs, extra_data, ss, rs, os,
            )

            mod_o_raw, mod_o_filt = _ranks_from_score_columns(
                mod_o_pred,
                true_modified_node_ilocs=objects,
                unmodified_node_ilocs=subjects,
                true_rel_ilocs=rels,
                modified_object=True,
                known_edges_graph=known_edges_graph,
                tie_breaking=tie_breaking,
            )
            mod_s_raw, mod_s_filt = _ranks_from_score_columns(
                mod_s_pred,
                true_modified_node_ilocs=subjects,
                true_rel_ilocs=rels,
                modified_object=False,
                unmodified_node_ilocs=objects,
                known_edges_graph=known_edges_graph,
                tie_breaking=tie_breaking,
            )

            raws.append(np.column_stack((mod_o_raw, mod_s_raw)))
            filtereds.append(np.column_stack((mod_o_filt, mod_s_filt)))

        # make one big array
        raw = np.concatenate(raws)
        filtered = np.concatenate(filtereds)
        # for each edge, there should be a pair of ranks
        assert raw.shape == filtered.shape == (num_tested, 2)

        return raw, filtered
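

# Hedged usage sketch (not part of the original module): the ``N × 2`` rank arrays returned
# by `rank_edges_against_all_nodes` are typically summarised into MRR and Hits@N metrics;
# `kg_model` and `test_flow` here are illustrative names, e.g.
#
#     raw, filtered = kg_model.rank_edges_against_all_nodes(test_flow, graph)
#     mrr = np.mean(1.0 / filtered)          # mean reciprocal rank, over both rank columns
#     hits_at_10 = np.mean(filtered <= 10)   # fraction of true edges ranked in the top 10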


class KGScore(abc.ABC):
    @abc.abstractmethod
    def embeddings(
        self, num_nodes, num_edge_types, dimension, initializer, regularizer
    ):
        """
        Create appropriate embedding layer(s) for this scoring.

        Args:
            num_nodes: the number of nodes in this graph.
            num_edge_types: the number of edge types/relations in this graph.
            dimension: the requested embedding dimension, for whatever that means for this scoring.
            initializer: the initializer to use for embeddings, when required.
            regularizer: the regularizer to use for embeddings, when required.

        Returns:
            A pair of lists of :class:`tensorflow.keras.layers.Embedding` layers, corresponding to
            nodes and edge types.
        """
        ...

    def embeddings_to_numpy(self, node_embs, edge_type_embs):
        """
        Convert raw embedding NumPy arrays into "semantic" embeddings, such as complex numbers
        instead of separate real and imaginary arrays.

        Args:
            node_embs: a list of ``num_nodes × k`` arrays of all node embeddings, where ``k`` is
                the dimension of each embedding layer returned by :meth:`embeddings`.
            edge_type_embs: a list of ``num_edge_types × k`` arrays of all edge type/relation
                embeddings, similarly.

        Returns:
            Model-specific NumPy arrays corresponding to some useful view of the embeddings vectors.
        """
        return node_embs, edge_type_embs

    def bulk_scoring_data(self, node_embs, edge_type_embs):
        """
        Pre-compute some data for bulk ranking, if any such data would be helpful.
        """
        return None

    @abc.abstractmethod
    def bulk_scoring(
        self, node_embs, extra_data, s_embs, r_embs, o_embs,
    ):
        """
        Compute a batch of modified-object and modified-subject scores for ranking.

        Args:
            node_embs: ``num_nodes × k`` arrays of all node embeddings, where ``k`` is the size
                of the embeddings returned by :meth:`embeddings_to_numpy`.

            extra_data: the return value of :meth:`bulk_scoring_data`

            s_embs: ``batch_size × k`` embeddings for the true source nodes
            r_embs: ``batch_size × k`` embeddings for the true edge types/relations
            o_embs: ``batch_size × k`` embeddings for the true object nodes

        Returns:
            This should return a pair of NumPy arrays of shape ``num_nodes × batch_size``. The first
            array contains scores of the modified-object edges, and the second contains scores of
            the modified-subject edges.
        """
        ...

    # this isn't a subclass of Keras Layer, because a model or other combination of individual
    # layers is okay too, but this model will be applied by calling the instance
    @abc.abstractmethod
    def __call__(self, inputs):
        """
        Apply this scoring mechanism to the selected values from the embedding layers.

        Args:
            inputs: a list of tensors selected from each of the embedding layers, concatenated
                like ``[subject, subject, ..., edge type, edge type, ..., object, object, ...]``
        """
        ...
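

# A minimal sketch (not part of the original module) of what a custom ``KGScore`` subclass
# can look like: a TransE-style scorer (Bordes et al. 2013) written against the interface
# above. The class name and everything in it are illustrative assumptions, not StellarGraph
# API; higher scores mean more plausible edges, matching `rank_edges_against_all_nodes`.
class _TransEScoreSketch(Layer, KGScore):
    def embeddings(
        self, num_nodes, num_edge_types, dimension, initializer, regularizer
    ):
        def embed(count):
            return Embedding(
                count,
                dimension,
                embeddings_initializer=initializer,
                embeddings_regularizer=regularizer,
            )

        # TransE uses a single real-valued embedding per node and per edge type
        return [embed(num_nodes)], [embed(num_edge_types)]

    def bulk_scoring(self, all_n_embs, _extra_data, s_embs, r_embs, o_embs):
        all_n = all_n_embs[0]
        s, r, o = s_embs[0], r_embs[0], o_embs[0]

        # score(s, r, o) = -||s + r - o||_2, broadcast to num_nodes × batch_size
        mod_o_pred = -np.linalg.norm(
            (s + r)[None, :, :] - all_n[:, None, :], axis=2
        )
        mod_s_pred = -np.linalg.norm(
            all_n[:, None, :] + r[None, :, :] - o[None, :, :], axis=2
        )
        return mod_o_pred, mod_s_pred

    def call(self, inputs):
        s, r, o = inputs
        # negate the translation distance so that higher = more plausible
        return -tf.norm(s + r - o, axis=2)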


def _numpy_complex(arrays):
    emb = 1j * arrays[1]
    emb += arrays[0]
    return emb


class ComplExScore(Layer, KGScore):
    """
    ComplEx scoring Keras layer.

    Original Paper: Complex Embeddings for Simple Link Prediction, Théo Trouillon, Johannes
    Welbl, Sebastian Riedel, Éric Gaussier and Guillaume Bouchard, ICML 2016.
    http://jmlr.org/proceedings/papers/v48/trouillon16.pdf

    This combines subject, relation and object embeddings into a score of the likelihood of the
    link.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def embeddings(
        self, num_nodes, num_edge_types, dimension, initializer, regularizer
    ):
        def embed(count):
            return Embedding(
                count,
                dimension,
                embeddings_initializer=initializer,
                embeddings_regularizer=regularizer,
            )

        # ComplEx generates embeddings in C, which we model as separate real and imaginary
        # embeddings
        nodes = [embed(num_nodes), embed(num_nodes)]
        edge_types = [embed(num_edge_types), embed(num_edge_types)]

        return nodes, edge_types

    def embeddings_to_numpy(self, node_embs, edge_type_embs):
        return (
            [_numpy_complex(node_embs)],
            [_numpy_complex(edge_type_embs)],
        )

    def bulk_scoring_data(self, node_embs, edge_type_embs):
        return node_embs[0].conj()

    def bulk_scoring(
        self, node_embs, node_embs_conj, s_embs, r_embs, o_embs,
    ):
        node_embs = node_embs[0]
        s_embs = s_embs[0]
        r_embs = r_embs[0]
        o_embs = o_embs[0]

        mod_o_pred = np.inner(node_embs_conj, s_embs * r_embs).real
        mod_s_pred = np.inner(node_embs, r_embs * o_embs.conj()).real
        return mod_o_pred, mod_s_pred

    def build(self, input_shape):
        self.built = True

    def call(self, inputs):
        """
        Applies the layer.

        Args:
            inputs: a list of 6 tensors (``shape = batch size × 1 × embedding dimension k``),
                where the three consecutive pairs represent real and imaginary parts of the
                subject, relation and object embeddings, respectively, that is, ``inputs ==
                [Re(subject), Im(subject), Re(relation), ...]``
        """
        s_re, s_im, r_re, r_im, o_re, o_im = inputs

        def inner(r, s, o):
            return tf.reduce_sum(r * s * o, axis=2)

        # expansion of Re(<w_r, e_s, conjugate(e_o)>)
        score = (
            inner(r_re, s_re, o_re)
            + inner(r_re, s_im, o_im)
            + inner(r_im, s_re, o_im)
            - inner(r_im, s_im, o_re)
        )

        return score
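

# Hedged illustration (not part of the original module): a quick NumPy check that the
# real-part expansion in `ComplExScore.call` above matches the direct complex form
# Re(<w_r, e_s, conj(e_o)>); all names here are local to this example.
#
#     rng = np.random.default_rng(0)
#     s = rng.normal(size=4) + 1j * rng.normal(size=4)
#     r = rng.normal(size=4) + 1j * rng.normal(size=4)
#     o = rng.normal(size=4) + 1j * rng.normal(size=4)
#     direct = np.sum(r * s * o.conj()).real
#     expanded = (
#         np.sum(r.real * s.real * o.real)
#         + np.sum(r.real * s.imag * o.imag)
#         + np.sum(r.imag * s.real * o.imag)
#         - np.sum(r.imag * s.imag * o.real)
#     )
#     assert np.isclose(direct, expanded)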


class ComplEx(KGModel):
    """
    Embedding layers and a ComplEx scoring layer that implement the ComplEx knowledge graph
    embedding algorithm as in http://jmlr.org/proceedings/papers/v48/trouillon16.pdf

    .. seealso::

       Example using ComplEx: `link prediction
       <https://stellargraph.readthedocs.io/en/stable/demos/link-prediction/complex-link-prediction.html>`__

       Related models: other knowledge graph models, see :class:`.KGTripleGenerator` for a full
       list.

       Appropriate data generator: :class:`.KGTripleGenerator`.

    Args:
        generator (KGTripleGenerator): A generator of triples to feed into the model.

        embedding_dimension (int): the dimension of the embedding (that is, a vector in
            ``C^embedding_dimension`` is learnt for each node and each link type)

        embeddings_initializer (str or func, optional): The initialiser to use for the embeddings
            (the default of random normal values matches the paper's reference implementation).

        embeddings_regularizer (str or func, optional): The regularizer to use for the
            embeddings.
    """

    def __init__(
        self,
        generator,
        embedding_dimension,
        embeddings_initializer="normal",
        embeddings_regularizer=None,
    ):
        super().__init__(
            generator,
            ComplExScore(),
            embedding_dimension=embedding_dimension,
            embeddings_initializer=embeddings_initializer,
            embeddings_regularizer=embeddings_regularizer,
        )

    build = deprecated_model_function(KGModel.in_out_tensors, "build")


class DistMultScore(Layer, KGScore):
    """
    DistMult scoring Keras layer.

    Original Paper: Embedding Entities and Relations for Learning and Inference in Knowledge
    Bases. Bishan Yang, Wen-tau Yih, Xiaodong He, Jianfeng Gao, Li Deng. ICLR 2015

    This combines subject, relation and object embeddings into a score of the likelihood of the
    link.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def embeddings(
        self, num_nodes, num_edge_types, dimension, initializer, regularizer
    ):
        def embed(count):
            # FIXME(#980,https://github.com/tensorflow/tensorflow/issues/33755): embeddings
            # can't use constraints to be normalized: per section 4 in the paper, the embeddings
            # should be normalised to have unit norm.
            return Embedding(
                count,
                dimension,
                embeddings_initializer=initializer,
                embeddings_regularizer=regularizer,
            )

        # DistMult generates embeddings in R
        nodes = [embed(num_nodes)]
        edge_types = [embed(num_edge_types)]
        return nodes, edge_types

    def bulk_scoring(
        self, all_n_embs, _extra_data, s_embs, r_embs, o_embs,
    ):
        all_n_embs = all_n_embs[0]
        s_embs = s_embs[0]
        r_embs = r_embs[0]
        o_embs = o_embs[0]

        mod_o_pred = np.inner(all_n_embs, s_embs * r_embs)
        mod_s_pred = np.inner(all_n_embs, r_embs * o_embs)
        return mod_o_pred, mod_s_pred

    def build(self, input_shape):
        self.built = True

    def call(self, inputs):
        """
        Applies the layer.

        Args:
            inputs: a list of 3 tensors (``shape = batch size × 1 × embedding dimension``),
                representing the subject, relation and object embeddings, respectively, that is,
                ``inputs == [subject, relation, object]``
        """
        y_e1, m_r, y_e2 = inputs

        # y_e1^T M_r y_e2, where M_r = diag(m_r) is a diagonal matrix
        score = tf.reduce_sum(y_e1 * m_r * y_e2, axis=2)

        return score
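

# Hedged illustration (not part of the original module): a NumPy check that the elementwise
# product in `DistMultScore.call` equals the bilinear form y_e1^T M_r y_e2 with diagonal M_r:
#
#     y1 = np.array([1.0, 2.0])
#     m = np.array([0.5, -1.0])
#     y2 = np.array([3.0, 4.0])
#     assert np.sum(y1 * m * y2) == y1 @ np.diag(m) @ y2   # both equal -6.5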


class DistMult(KGModel):
    """
    Embedding layers and a DistMult scoring layer that implement the DistMult knowledge graph
    embedding algorithm as in https://arxiv.org/pdf/1412.6575.pdf

    .. seealso::

       Example using DistMult: `link prediction
       <https://stellargraph.readthedocs.io/en/stable/demos/link-prediction/distmult-link-prediction.html>`__

       Related models: other knowledge graph models, see :class:`.KGTripleGenerator` for a full
       list.

       Appropriate data generator: :class:`.KGTripleGenerator`.

    Args:
        generator (KGTripleGenerator): A generator of triples to feed into the model.

        embedding_dimension (int): the dimension of the embedding (that is, a vector in
            ``R^embedding_dimension`` is learnt for each node and each link type)

        embeddings_initializer (str or func, optional): The initialiser to use for the
            embeddings.

        embeddings_regularizer (str or func, optional): The regularizer to use for the
            embeddings.
    """

    def __init__(
        self,
        generator,
        embedding_dimension,
        embeddings_initializer="uniform",
        embeddings_regularizer=None,
    ):
        super().__init__(
            generator,
            DistMultScore(),
            embedding_dimension=embedding_dimension,
            embeddings_initializer=embeddings_initializer,
            embeddings_regularizer=embeddings_regularizer,
        )

    build = deprecated_model_function(KGModel.in_out_tensors, "build")


class RotatEScore(Layer, KGScore):
    def __init__(self, margin, norm_order, **kwargs):
        super().__init__(**kwargs)
        self._margin = margin
        self._norm_order = norm_order

    def embeddings(
        self, num_nodes, num_edge_types, dimension, initializer, regularizer
    ):
        def embed(count, reg=regularizer):
            return Embedding(
                count,
                dimension,
                embeddings_initializer=initializer,
                embeddings_regularizer=reg,
            )

        # RotatE generates embeddings in C, which we model as separate real and imaginary
        # embeddings for nodes, and just the phase for edge types (since they have |x| = 1)
        nodes = [embed(num_nodes), embed(num_nodes)]
        # it doesn't make sense to regularize the phase, because it's circular
        edge_types = [embed(num_edge_types, reg=None)]

        return nodes, edge_types

    def embeddings_to_numpy(self, node_embs, edge_type_embs):
        nodes = _numpy_complex(node_embs)

        edge_types = 1j * np.sin(edge_type_embs[0])
        edge_types += np.cos(edge_type_embs[0])

        return [nodes], [edge_types]

    def bulk_scoring(
        self, all_n_embs, _extra_data, s_embs, r_embs, o_embs,
    ):
        all_n_embs = all_n_embs[0]
        s_embs = s_embs[0]
        r_embs = r_embs[0]
        o_embs = o_embs[0]

        # (the margin is a fixed offset that doesn't affect relative ranks)
        mod_o_pred = -np.linalg.norm(
            (s_embs * r_embs)[None, :, :] - all_n_embs[:, None, :],
            ord=self._norm_order,
            axis=2,
        )
        mod_s_pred = -np.linalg.norm(
            all_n_embs[:, None, :] * r_embs[None, :, :] - o_embs[None, :, :],
            ord=self._norm_order,
            axis=2,
        )
        return mod_o_pred, mod_s_pred

    def get_config(self):
        return {
            **super().get_config(),
            "margin": self._margin,
            "norm_order": self._norm_order,
        }

    def call(self, inputs):
        s_re, s_im, r_phase, o_re, o_im = inputs

        r_re = tf.math.cos(r_phase)
        r_im = tf.math.sin(r_phase)

        # expansion of s ◦ r - o
        re = s_re * r_re - s_im * r_im - o_re
        im = s_re * r_im + s_im * r_re - o_im

        # margin minus the norm of the vector: margin - || ... ||_p
        return self._margin - tf.norm(
            tf.sqrt(re * re + im * im), ord=self._norm_order, axis=2
        )
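

# Hedged note (not part of the original module): the phase embedding makes each relation a
# unit-modulus complex number, so the Hadamard product s ◦ r is a pure rotation of s, e.g.
#
#     s = np.array([1.0 + 2.0j])
#     r = np.exp(1j * np.array([np.pi / 2]))        # |r| == 1
#     assert np.isclose((s * r)[0], -2.0 + 1.0j)    # rotated 90°, same modulus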
[docs]@experimental(reason="demo and documentation is missing", issues=[1549, 1550]) class RotatE(KGModel): """ Implementation of https://arxiv.org/abs/1902.10197 .. seealso:: Related models: other knowledge graph models, see :class:`.KGTripleGenerator` for a full list. Appropriate data generator: :class:`.KGTripleGenerator`. """ def __init__( self, generator, embedding_dimension, # default taken from the paper's code: https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding margin=12.0, # default taken from the paper's code: https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding norm_order=2, embeddings_initializer="normal", embeddings_regularizer=None, ): super().__init__( generator, RotatEScore(margin=margin, norm_order=norm_order), embedding_dimension, embeddings_initializer=embeddings_initializer, embeddings_regularizer=embeddings_regularizer, )


class RotHEScore(Layer, KGScore):
    def __init__(self, hyperbolic):
        self._hyperbolic = hyperbolic

        if self._hyperbolic:
            self._convert = lambda c, v: poincare_ball_exp(c, None, v)
            self._add = poincare_ball_mobius_add
            self._squared_distance = lambda c, v, w: tf.square(
                poincare_ball_distance(c, v, w)
            )
        else:
            self._convert = lambda _c, v: v
            self._add = lambda _c, v, w: v + w
            self._squared_distance = lambda _c, v, w: tf.reduce_sum(
                tf.math.squared_difference(v, w), axis=-1
            )

        super().__init__()

    def embeddings(
        self, num_nodes, num_edge_types, dimension, initializer, regularizer
    ):
        if dimension % 2 != 0:
            raise ValueError(
                f"embedding_dimension: expected an even integer, found {dimension}"
            )

        def embed(count, dim=dimension):
            return Embedding(
                count,
                dim,
                embeddings_initializer=initializer,
                embeddings_regularizer=regularizer,
            )

        # each node gets a vector embedding and a scalar bias; each edge type gets a vector
        # embedding and a vector of rotation angles (one per coordinate pair)
        nodes = [embed(num_nodes), embed(num_nodes, 1)]
        edge_types = [embed(num_edge_types), embed(num_edge_types, dimension // 2)]

        return nodes, edge_types

    def build(self, input_shapes):
        if self._hyperbolic:
            self.curvature_prime = self.add_weight(shape=(1,), name="curvature_prime")
        else:
            self.curvature_prime = None

        super().build(input_shapes)

    def _curvature(self):
        assert self.built
        if not self._hyperbolic:
            return tf.constant([0.0])

        return tf.math.softplus(self.curvature_prime)

    def _rotate(self, theta, emb):
        shape = tf.maximum(tf.shape(theta), tf.shape(emb))

        # manual rotation matrix
        cos = tf.math.cos(theta)
        sin = tf.math.sin(theta)

        evens = cos * emb[..., ::2] - sin * emb[..., 1::2]
        odds = sin * emb[..., ::2] + cos * emb[..., 1::2]
        return tf.reshape(tf.stack([evens, odds], axis=-1), shape)

    def call(self, inputs):
        e_s, b_s, r_r, theta_r, e_o, b_o = inputs

        curvature = self._curvature()

        b_s = tf.squeeze(b_s, axis=-1)
        b_o = tf.squeeze(b_o, axis=-1)

        eh_s = self._convert(curvature, e_s)
        rh_r = self._convert(curvature, r_r)
        eh_o = self._convert(curvature, e_o)

        rotated_s = self._rotate(theta_r, eh_s)

        d = self._squared_distance(
            curvature, self._add(curvature, rotated_s, rh_r), eh_o
        )
        return -d + b_s + b_o

    def bulk_scoring(
        self, all_n_embs, _extra_data, s_embs, r_embs, o_embs,
    ):
        curvature = self._curvature()

        e_all, b_all = all_n_embs
        e_all = e_all[:, None, :]
        b_all = b_all[:, None, 0]

        e_s, b_s = s_embs
        e_s = e_s[None, :, :]
        b_s = b_s[None, :, 0]

        r_r, theta_r = r_embs
        r_r = r_r[None, :, :]
        theta_r = theta_r[None, :, :]

        e_o, b_o = o_embs
        e_o = e_o[None, :, :]
        b_o = b_o[None, :, 0]

        # convert everything onto the manifold before measuring distances, matching `call`
        eh_all = self._convert(curvature, e_all)
        eh_s = self._convert(curvature, e_s)
        rh_r = self._convert(curvature, r_r)

        rotated_s = self._rotate(theta_r, eh_s)
        d_mod_o = self._squared_distance(
            curvature, self._add(curvature, rotated_s, rh_r), eh_all
        )
        mod_o_pred = -d_mod_o + b_s + b_all

        del eh_s, d_mod_o, rotated_s

        eh_o = self._convert(curvature, e_o)
        rotated_all = self._rotate(theta_r, eh_all)
        d_mod_s = self._squared_distance(
            curvature, self._add(curvature, rotated_all, rh_r), eh_o
        )
        mod_s_pred = -d_mod_s + b_all + b_o

        return mod_o_pred.numpy(), mod_s_pred.numpy()
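

# Hedged illustration (not part of the original module): `RotHEScore._rotate` applies a
# block-diagonal Givens rotation, i.e. each consecutive coordinate pair (x, y) of `emb` is
# rotated by the matching angle in `theta`, equivalent to multiplying x + iy, viewed as a
# complex number, by exp(i·theta):
#
#     theta = np.array([np.pi / 2])
#     x, y = 1.0, 2.0
#     evens = np.cos(theta) * x - np.sin(theta) * y   # -> -2.0
#     odds = np.sin(theta) * x + np.cos(theta) * y    # ->  1.0
#     assert np.isclose((x + 1j * y) * np.exp(1j * theta), evens + 1j * odds)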
[docs]@experimental(reason="demo is missing", issues=[1664]) class RotH(KGModel): """ Embedding layers and a RotH scoring layer that implement the RotH knowledge graph embedding algorithm as in https://arxiv.org/abs/2005.00545 .. seealso:: Related models: - other knowledge graph models, see :class:`.KGTripleGenerator` for a full list - :class:`.RotE` for the Euclidean version of this hyperbolic model Appropriate data generator: :class:`.KGTripleGenerator`. Args: generator (KGTripleGenerator): A generator of triples to feed into the model. embedding_dimension (int): the dimension of the embeddings (that is, a vector in ``R^embedding_dimension`` plus a bias in ``R`` is learnt for each node, along with a pair of vectors in ``R^embedding_dimension`` and ``R^(embedding_dimension / 2)`` for each node type). It must be even. embeddings_initializer (str or func, optional): The initialiser to use for the embeddings. embeddings_regularizer (str or func, optional): The regularizer to use for the embeddings. """ def __init__( self, generator, embedding_dimension, embeddings_initializer="normal", embeddings_regularizer=None, ): super().__init__( generator, RotHEScore(hyperbolic=True), embedding_dimension=embedding_dimension, embeddings_initializer=embeddings_initializer, embeddings_regularizer=embeddings_regularizer, )
[docs]@experimental(reason="demo is missing", issues=[1664]) class RotE(KGModel): """ Embedding layers and a RotE scoring layer that implement the RotE knowledge graph embedding algorithm as in https://arxiv.org/pdf/2005.00545.pdf .. seealso:: Related models: - other knowledge graph models, see :class:`.KGTripleGenerator` for a full list - :class:`.RotH` for the hyperbolic version of this Euclidean model Appropriate data generator: :class:`.KGTripleGenerator`. Args: generator (KGTripleGenerator): A generator of triples to feed into the model. embedding_dimension (int): the dimension of the embeddings (that is, a vector in ``R^embedding_dimension`` plus a bias in ``R`` is learnt for each node, along with a pair of vectors in ``R^embedding_dimension`` and ``R^(embedding_dimension / 2)`` for each node type). It must be even. embeddings_initializer (str or func, optional): The initialiser to use for the embeddings. embeddings_regularizer (str or func, optional): The regularizer to use for the embeddings. """ def __init__( self, generator, embedding_dimension, embeddings_initializer="normal", embeddings_regularizer=None, ): super().__init__( generator, RotHEScore(hyperbolic=False), embedding_dimension=embedding_dimension, embeddings_initializer=embeddings_initializer, embeddings_regularizer=embeddings_regularizer, )


def _ranks_from_comparisons(greater, greater_equal, tie_breaking):
    strict = 1 + greater.sum(axis=0)
    # with_ties - strict = the number of elements exactly equal (including the true edge itself)
    with_ties = greater_equal.sum(axis=0)

    if tie_breaking == "top":
        return strict
    elif tie_breaking == "bottom":
        return with_ties
    elif tie_breaking == "random":
        return np.random.randint(strict, with_ties + 1)
    else:
        raise ValueError(
            f"tie_breaking: expected 'top', 'bottom' or 'random', found {tie_breaking!r}"
        )


def _ranks_from_score_columns(
    pred,
    *,
    true_modified_node_ilocs,
    unmodified_node_ilocs,
    true_rel_ilocs,
    modified_object,
    known_edges_graph,
    tie_breaking,
):
    """
    Compute the raw and filtered ranks of a set of true edges ``E = (s, r, o)`` against all
    mutations of one end of them, e.g. ``E' = (s, r, n)`` for "modified-object".

    The raw rank is the total number of edges scored higher than the true edge ``E``, and the
    filtered rank is the total number of unknown edges (not in ``known_edges_graph``).

    Args:
        pred: a 2D array: each column represents the scores for a single true edge and its
            mutations, where the row indicates the ``n`` in ``E'`` (e.g. row 0 corresponds to
            ``n`` = node with iloc 0)
        true_modified_node_ilocs: an array of ilocs of the actual node that was modified, that
            is, ``o`` for modified-object and ``s`` for modified-subject; index ``i`` corresponds
            to the iloc for column ``pred[:, i]``.
        unmodified_node_ilocs: similar to ``true_modified_node_ilocs``, except for the other end
            of the edge: the node that was not modified.
        true_rel_ilocs: similar to ``true_modified_node_ilocs``, except for the relationship
            type of the edge (``r``).
        modified_object (bool): whether the object was modified (``True``), or the subject
            (``False``)
        known_edges_graph (StellarGraph): a graph containing all the known edges that should be
            ignored when computing filtered ranks

    Returns:
        a tuple of raw ranks and filtered ranks, each is an array of integers >= 1 where index
        ``i`` corresponds to the rank of the true edge among all of the scores in column
        ``pred[:, i]``.
    """
    batch_size = len(true_modified_node_ilocs)
    assert pred.shape == (known_edges_graph.number_of_nodes(), batch_size)
    assert unmodified_node_ilocs.shape == true_rel_ilocs.shape == (batch_size,)

    # the score of the true edge, for each edge in the batch (this indexes in lock-step,
    # i.e. [pred[true_modified_node_ilocs[0], range(batch_size)[0]], ...])
    true_scores = pred[true_modified_node_ilocs, range(batch_size)]

    # for each column, compare all the scores against the score of the true edge
    greater = pred > true_scores
    greater_equal = pred >= true_scores

    # the raw rank is the number of elements scored higher than the true edge
    raw_rank = _ranks_from_comparisons(greater, greater_equal, tie_breaking)

    # the filtered rank is the number of unknown elements scored higher, where an element is
    # known if the edge (s, r, n) (for modified-object) or (n, r, o) (for modified-subject)
    # exists in known_edges_graph.
    if modified_object:
        neigh_func = known_edges_graph.out_nodes
    else:
        neigh_func = known_edges_graph.in_nodes

    for batch_column, (unmodified, r) in enumerate(
        zip(unmodified_node_ilocs, true_rel_ilocs)
    ):
        this_neighs = neigh_func(unmodified, edge_types=[r], use_ilocs=True)
        greater[this_neighs, batch_column] = False
        greater_equal[this_neighs, batch_column] = False

    # the true edges should always be counted as equal, whether or not they are known edges
    greater_equal[true_modified_node_ilocs, range(batch_size)] = True

    filtered_rank = _ranks_from_comparisons(greater, greater_equal, tie_breaking)

    assert raw_rank.shape == filtered_rank.shape == (batch_size,)
    return raw_rank, filtered_rank
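

# Hedged worked example (not part of the original module): for a single column of scores
# [5.0, 3.14, 2.0, 3.14] where the true edge is row 3 (score 3.14), one score is strictly
# greater and, counting ties, the true edge could rank as low as 3:
#
#     greater = np.array([[True], [False], [False], [False]])
#     greater_equal = np.array([[True], [True], [False], [True]])
#     _ranks_from_comparisons(greater, greater_equal, "top")     # -> [2]
#     _ranks_from_comparisons(greater, greater_equal, "bottom")  # -> [3]
#     # "random" draws uniformly from {2, 3}, avoiding bias from tied scores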