Source code for stellargraph.utils.calibration

# -*- coding: utf-8 -*-
#
# Copyright 2018-2019 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Calibration for classification, binary and multi-class, models.
"""

__all__ = [
    "IsotonicCalibration",
    "TemperatureCalibration",
    "expected_calibration_error",
    "plot_reliability_diagram",
]

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression


def expected_calibration_error(prediction_probabilities, accuracy, confidence):
    """
    Helper function for calculating the expected calibration error as defined in
    the paper On Calibration of Modern Neural Networks, C. Guo et al., ICML, 2017.

    It is assumed that for a validation dataset, the prediction probabilities have been calculated for each point
    in the dataset and given in the array prediction_probabilities.

    Args:
        prediction_probabilities (numpy array): The predicted probabilities.
        accuracy (numpy array): The accuracy such that the i-th entry in the array holds the proportion of correctly
            classified samples that fall in the i-th bin.
        confidence (numpy array): The confidence such that the i-th entry in the array is the average prediction
            probability over all the samples assigned to this bin.

    Returns:
        float: The expected calibration error.
    """
    if not isinstance(prediction_probabilities, np.ndarray):
        raise ValueError(
            "Parameter prediction_probabilities must be type numpy.ndarray but given object of type {}".format(
                type(prediction_probabilities)
            )
        )
    if not isinstance(accuracy, np.ndarray):
        raise ValueError(
            "Parameter accuracy must be type numpy.ndarray but given object of type {}".format(
                type(accuracy)
            )
        )
    if not isinstance(confidence, np.ndarray):
        raise ValueError(
            "Parameter confidence must be type numpy.ndarray but given object of type {}".format(
                type(confidence)
            )
        )
    if len(accuracy) != len(confidence):
        raise ValueError(
            "Arrays accuracy and confidence should have the same size but instead received {} and {} respectively.".format(
                len(accuracy), len(confidence)
            )
        )

    n_bins = len(accuracy)  # the number of bins
    n = len(prediction_probabilities)  # the number of samples
    h = np.histogram(a=prediction_probabilities, range=(0, 1), bins=n_bins)[
        0
    ]  # just the counts

    ece = 0
    for m in np.arange(n_bins):
        ece = ece + (h[m] / n) * np.abs(accuracy[m] - confidence[m])

    return ece

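
# Illustrative example (not part of the original module): a minimal sketch of
# how the accuracy and confidence arrays expected by expected_calibration_error
# can be derived from a classifier's validation predictions. All data below is
# synthetic and all names are hypothetical.
def _example_expected_calibration_error():
    rng = np.random.RandomState(0)
    val_probs = rng.dirichlet(np.ones(4), size=200)  # stand-in (N, C) probabilities
    val_labels = rng.randint(0, 4, size=200)  # stand-in (N,) integer class labels

    confidences = np.max(val_probs, axis=1)  # per-sample prediction probability
    correct = np.argmax(val_probs, axis=1) == val_labels

    n_bins = 10
    edges = np.linspace(0, 1, n_bins + 1)
    accuracy = np.zeros(n_bins)
    confidence = np.zeros(n_bins)
    for m in range(n_bins):
        mask = (confidences > edges[m]) & (confidences <= edges[m + 1])
        if mask.any():
            accuracy[m] = correct[mask].mean()  # proportion correct in bin m
            confidence[m] = confidences[mask].mean()  # mean probability in bin m

    return expected_calibration_error(confidences, accuracy, confidence)
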
def plot_reliability_diagram(calibration_data, predictions, ece=None, filename=None):
    """
    Helper function for plotting a reliability diagram.

    Args:
        calibration_data (list): The calibration data as a list where each entry in the list is a 2-tuple of type
            numpy.ndarray. Each entry in the tuple holds the fraction of positives and the mean predicted values
            for the true and predicted class labels.
        predictions (np.ndarray): The probabilistic predictions of the classifier for each sample in the dataset
            used for diagnosing miscalibration.
        ece (None or list of float): If not None, this list stores the expected calibration error for each class.
        filename (str or None): If not None, the figure is saved on disk in the given filename.
    """
    if not isinstance(calibration_data, list):
        raise ValueError(
            "Parameter calibration_data should be list of 2-tuples but received type {}".format(
                type(calibration_data)
            )
        )
    if not isinstance(predictions, np.ndarray):
        raise ValueError(
            "Parameter predictions should be of type numpy.ndarray but received type {}".format(
                type(predictions)
            )
        )
    if ece is not None and not isinstance(ece, list):
        raise ValueError(
            "Parameter ece should be None or list of floating point numbers but received type {}".format(
                type(ece)
            )
        )
    if filename is not None and not isinstance(filename, str):
        raise ValueError(
            "Parameter filename should be None or str type but received type {}".format(
                type(filename)
            )
        )

    fig = plt.figure(figsize=(12, 8))
    ax1 = plt.subplot2grid((6, 1), (0, 0), rowspan=3)
    ax2 = plt.subplot2grid((6, 1), (4, 0))

    if ece is not None:
        calibration_error = ",".join(format(e, " 0.4f") for e in ece)

    for i, data in enumerate(calibration_data):
        fraction_of_positives, mean_predicted_value = data
        ax1.plot(mean_predicted_value, fraction_of_positives, "s-", alpha=1.0)
        if ece is not None:
            ax1.set_title("Calibration Curve (ECE={})".format(calibration_error))
        ax1.set_xlabel("Mean Predicted Value", fontsize=16)
        ax1.set_ylabel("Fraction of Positives", fontsize=16)
        ax1.plot([0, 1], [0, 1], "g--")
        ax2.hist(predictions[:, i], range=(0, 1), bins=10, histtype="step", lw=2)
        ax2.set_xlabel("Bin", fontsize=16)
        ax2.set_ylabel("Count", fontsize=16)

    if filename is not None:
        fig.savefig(filename, bbox_inches="tight")

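
# Illustrative example (not part of the original module): assembling the
# calibration_data argument with scikit-learn's calibration_curve for a
# hypothetical binary classifier. `probs` and `labels` are synthetic stand-ins
# for real model outputs and ground truth.
def _example_plot_reliability_diagram():
    from sklearn.calibration import calibration_curve

    rng = np.random.RandomState(0)
    probs = rng.uniform(size=(200, 1))  # stand-in predicted probabilities
    labels = (rng.uniform(size=200) < probs[:, 0]).astype(int)  # stand-in labels

    fraction_of_positives, mean_predicted_value = calibration_curve(
        labels, probs[:, 0], n_bins=10
    )
    plot_reliability_diagram(
        calibration_data=[(fraction_of_positives, mean_predicted_value)],
        predictions=probs,
    )
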
class TemperatureCalibration(object):
    """
    A class for temperature calibration for binary and multi-class classification problems.

    For binary classification, Platt Scaling is used for calibration. Platt Scaling was proposed in the paper
    Probabilistic outputs for support vector machines and comparisons to regularized likelihood methods,
    J. C. Platt, Advances in large margin classifiers, 10(3): 61-74, 1999.

    For multi-class classification, Temperature Scaling is used. It is an extension of Platt Scaling and it was
    proposed in the paper On Calibration of Modern Neural Networks, C. Guo et al., ICML, 2017.

    In Temperature Scaling, a classifier's non-probabilistic outputs, i.e., logits, are scaled by a trainable
    parameter called the Temperature. The softmax is applied to the rescaled logits to calculate the probabilistic
    output. As noted in the cited paper, Temperature Scaling does not change the maximum of the softmax function,
    so the classifier's predictions remain the same.
    """

    def __init__(self, epochs=1000):
        self.epochs = epochs
        self.n_classes = None
        self.temperature = 1.0  # default is no scaling
        self.history = []
        self.early_stopping = False
        self.lr = None  # the logistic regression model for Platt scaling

    def _fit_temperature_scaling(self, x_train, y_train, x_val=None, y_val=None):
        """
        Train the calibration model using Temperature Scaling.

        If validation data is given, then training stops when the validation loss starts increasing.

        Args:
            x_train (numpy array): The training data that should be a classifier's non-probabilistic outputs. It
                should have shape (N, C) where N is the number of samples and C is the number of classes.
            y_train (numpy array): The training data class labels. It should have shape (N, C) where N is the
                number of samples and C is the number of classes and the class labels are one-hot encoded.
            x_val (numpy array or None): The validation data used for early stopping. It should be the classifier's
                non-probabilistic outputs with shape (M, C) where M is the number of validation samples and C is
                the number of classes.
            y_val (numpy array or None): The validation data class labels. It should have shape (M, C) where M is
                the number of validation samples and C is the number of classes and the class labels are one-hot
                encoded.
        """
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            x = tf.placeholder(
                tf.float32, [None, self.n_classes], name="x"
            )  # input are the model logits
            y = tf.placeholder(
                tf.float32, [None, self.n_classes], name="y"
            )  # output is one-hot encoded true class labels
            T = tf.get_variable(
                "T", [1], initializer=tf.ones_initializer
            )  # the temperature
            scaled_logits = tf.multiply(
                name="z", x=x, y=1.0 / T
            )  # logits scaled by inverse T

            # cost function to optimise
            cost = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=scaled_logits, labels=y
                )
            )
            optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)
            init = tf.global_variables_initializer()

            sess = tf.Session()
            sess.run(init)

            self.history = []
            for epoch in range(self.epochs):
                _, c, t = sess.run(
                    [optimizer, cost, T], feed_dict={x: x_train, y: y_train}
                )
                if self.early_stopping:
                    c_val = sess.run(cost, feed_dict={x: x_val, y: y_val})
                    # stop as soon as the validation loss starts increasing
                    if len(self.history) > 0 and c_val > self.history[-1][1]:
                        break
                    else:
                        self.history.append([c, c_val, t[0]])
                else:
                    self.history.append([c, t[0]])

        self.history = np.array(self.history)
        self.temperature = self.history[-1, -1]

    def _fit_platt_scaling(self, x_train, y_train):
        """
        Helper method for calibration of a binary classifier using Platt Scaling.

        Args:
            x_train (numpy array): The training data that should be a classifier's non-probabilistic outputs. It
                should have shape (N, 1) where N is the number of training samples.
            y_train (numpy array): The training data class labels. It should have shape (N, 1) where N is the
                number of training samples.
        """
        self.lr = LogisticRegression(fit_intercept=True, verbose=False)
        self.lr.fit(x_train, y_train)

    def fit(self, x_train, y_train, x_val=None, y_val=None):
        """
        Train the calibration model.

        For temperature scaling of a multi-class classifier, if validation data is given, then training stops when
        the validation loss starts increasing. Validation data are ignored for Platt scaling.

        Args:
            x_train (numpy array): The training data that should be a classifier's non-probabilistic outputs. For
                calibrating a binary classifier it should have shape (N,) where N is the number of training samples.
                For calibrating a multi-class classifier, it should have shape (N, C) where N is the number of
                samples and C is the number of classes.
            y_train (numpy array): The training data class labels. For calibrating a binary classifier it should
                have shape (N,) where N is the number of training samples. For calibrating a multi-class classifier,
                it should have shape (N, C) where N is the number of samples and C is the number of classes and the
                class labels are one-hot encoded.
            x_val (numpy array or None): The validation data, used only for calibrating multi-class classification
                models. It should be the classifier's non-probabilistic outputs with shape (M, C) where M is the
                number of validation samples and C is the number of classes.
            y_val (numpy array or None): The validation data class labels, used only for calibrating multi-class
                classification models. It should have shape (M, C) where M is the number of validation samples and
                C is the number of classes and the class labels are one-hot encoded.
        """
        if not isinstance(x_train, np.ndarray) or not isinstance(y_train, np.ndarray):
            raise ValueError("x_train and y_train must be numpy arrays")

        if (x_val is not None and y_val is None) or (
            x_val is None and y_val is not None
        ):
            raise ValueError(
                "Either both x_val and y_val should be None or both should be numpy arrays."
            )

        if x_val is not None and y_val is not None:
            if not isinstance(x_val, np.ndarray) or not isinstance(y_val, np.ndarray):
                raise ValueError("x_val and y_val must be numpy arrays")
            self.early_stopping = True
            print(
                "Using Early Stopping based on performance evaluated on given validation set."
            )

        if len(x_train.shape) == 1:
            self.n_classes = 1
        else:
            self.n_classes = x_train.shape[1]

        if self.n_classes > 1:
            self._fit_temperature_scaling(x_train, y_train, x_val, y_val)
        else:
            self._fit_platt_scaling(x_train.reshape(-1, 1), y_train.reshape(-1, 1))

    def plot_training_history(self):
        """
        Helper function for plotting the training history.
        """
        fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, figsize=(12, 5))
        ax1.plot(self.history[:, 0], label="Training")
        if self.history.shape[1] == 3:  # has validation cost
            ax1.plot(self.history[:, 1], label="Validation")
        ax1.set_title("Cost")
        ax1.set_xlabel("Epoch")
        ax1.set_ylabel("Cost")
        ax1.legend(loc="upper right")
        ax2.plot(self.history[:, -1])
        ax2.set_title("Temperature")
        ax2.set_xlabel("Epoch")
        ax2.set_ylabel("Temperature")

    def predict(self, x):
        """
        This method calibrates the given data using the learned temperature. It scales each logit by the
        temperature, exponentiates the results, and finally normalizes the scaled values such that their sum is 1.

        Args:
            x (numpy.ndarray): The logits. For binary classification problems, it should have dimensionality (N,)
                where N is the number of samples to calibrate. For multi-class problems, it should have
                dimensionality (N, C) where C is the number of classes.

        Returns:
            numpy array: The calibrated probabilities.
        """
        if not isinstance(x, np.ndarray):
            raise ValueError(
                "x should be numpy.ndarray but received {}".format(type(x))
            )
        if len(x.shape) > 1 and x.shape[1] != self.n_classes:
            raise ValueError(
                "Expecting input vector of dimensionality {} but received {}".format(
                    self.n_classes, x.shape[1]
                )
            )

        if self.n_classes == 1:
            # scikit-learn's LogisticRegression expects a 2D input
            return self.lr.predict_proba(X=x.reshape(-1, 1))[:, 1].reshape(-1, 1)
        else:
            scaled_prediction = x / self.temperature
            return np.exp(scaled_prediction) / np.sum(
                np.exp(scaled_prediction), axis=-1, keepdims=True
            )

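
# Illustrative example (not part of the original module): fitting and applying
# TemperatureCalibration to synthetic multi-class logits. Because a (positive)
# learned temperature only rescales the logits before the softmax, the argmax
# of the calibrated output matches the argmax of the raw logits.
def _example_temperature_calibration():
    rng = np.random.RandomState(0)
    val_logits = rng.normal(size=(100, 3))  # stand-in (M, C) validation logits
    val_labels = np.eye(3)[rng.randint(0, 3, size=100)]  # stand-in one-hot labels

    calibration = TemperatureCalibration(epochs=1000)
    calibration.fit(val_logits, val_labels)  # no x_val/y_val, so no early stopping

    test_logits = rng.normal(size=(10, 3))
    calibrated = calibration.predict(test_logits)  # rows sum to 1
    assert np.allclose(calibrated.sum(axis=1), 1.0)
    assert np.array_equal(
        np.argmax(calibrated, axis=1), np.argmax(test_logits, axis=1)
    )
    return calibrated
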
class IsotonicCalibration(object):
    """
    A class for applying Isotonic Calibration to the outputs of a binary or multi-class classifier.
    """

    def __init__(self):
        self.n_classes = None
        self.regressors = []

    def fit(self, x_train, y_train):
        """
        Train a calibration model using the provided data.

        Args:
            x_train (numpy array): The training data that should be the classifier's probabilistic outputs. It
                should have shape (N, C) where N is the number of training samples and C is the number of classes.
            y_train (numpy array): The training class labels. For binary problems y_train has shape (N,) where N is
                the number of samples. For multi-class classification, y_train has shape (N, C) where C is the
                number of classes and y_train uses one-hot encoding.
        """
        if not isinstance(x_train, np.ndarray) or not isinstance(y_train, np.ndarray):
            raise ValueError(
                "x_train and y_train should be type numpy.ndarray but received {} and {}".format(
                    type(x_train), type(y_train)
                )
            )

        if len(x_train.shape) == 1:
            self.n_classes = 1
        else:
            self.n_classes = x_train.shape[1]

        if self.n_classes == 1:
            self.regressors.append(IsotonicRegression(out_of_bounds="clip"))
            if len(x_train.shape) > 1:
                x_train = x_train.reshape(-1)
            self.regressors[-1].fit(X=x_train.astype(np.double), y=y_train)
        else:
            for n in range(self.n_classes):
                self.regressors.append(IsotonicRegression(out_of_bounds="clip"))
                self.regressors[-1].fit(
                    X=x_train[:, n].astype(np.double), y=y_train[:, n]
                )

    def predict(self, x):
        """
        This method calibrates the given data, assumed to be the output of a classification model.

        For multi-class classification, the probabilities for each class are first scaled using the corresponding
        isotonic regression model and then normalized to sum to 1.

        Args:
            x (numpy array): The values to calibrate. For binary classification problems it should have shape (N,)
                where N is the number of samples to calibrate. For multi-class classification problems, it should
                have shape (N, C) where C is the number of classes.

        Returns:
            numpy array: The calibrated probabilities. It has shape (N, C) where N is the number of samples and C
                is the number of classes.
        """
        if not isinstance(x, np.ndarray):
            raise ValueError(
                "x should be numpy.ndarray but received {}".format(type(x))
            )
        if self.n_classes > 1 and x.shape[1] != self.n_classes:
            raise ValueError(
                "Expecting input vector of dimensionality {} but received {}".format(
                    self.n_classes, x.shape[1]
                )
            )

        if self.n_classes == 1:
            x = x.reshape(-1, 1)

        predictions = []
        for n in range(self.n_classes):
            predictions.append(self.regressors[n].transform(T=x[:, n]))
        predictions = np.transpose(np.array(predictions))

        if self.n_classes > 1:
            predictions = predictions / np.sum(predictions, axis=-1, keepdims=True)

        return predictions

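
# Illustrative example (not part of the original module): calibrating the
# probabilistic outputs of a hypothetical binary classifier with
# IsotonicCalibration. In practice `train_probs` and `train_labels` would come
# from a held-out validation set rather than random draws.
def _example_isotonic_calibration():
    rng = np.random.RandomState(0)
    train_probs = rng.uniform(size=500)  # stand-in (N,) predicted probabilities
    train_labels = (rng.uniform(size=500) < train_probs).astype(int)  # stand-in labels

    calibration = IsotonicCalibration()
    calibration.fit(train_probs, train_labels)

    test_probs = rng.uniform(size=20)
    return calibration.predict(test_probs)  # calibrated probabilities, shape (20, 1)
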