Created May 23, 2018 23:42
Reliability diagram code
import torch
import numpy as np
from matplotlib import pyplot as plt
def make_model_diagrams(outputs, labels, n_bins=10):
    """
    Draw a reliability diagram (per-bin accuracy vs. confidence).

    outputs - a torch tensor (size n x num_classes) with the outputs from the
        final linear layer - NOT the softmaxes
    labels - a torch tensor (size n) with the labels
    n_bins - number of equal-width confidence bins over [0, 1]

    Returns the matplotlib figure containing the diagram.
    """
    softmaxes = torch.nn.functional.softmax(outputs, 1)
    confidences, predictions = softmaxes.max(1)
    # Cast to float: torch.mean is not defined for bool/integer tensors.
    accuracies = torch.eq(predictions, labels).float().detach().cpu()
    confidences = confidences.float().detach().cpu()

    # A single axes object; subplots(1, 2) would return an array of axes,
    # which has no .bar/.plot/.legend methods.
    f, rel_ax = plt.subplots(1, 1, figsize=(4, 2.5))

    # Reliability diagram
    bins = torch.linspace(0, 1, n_bins + 1)
    # Nudge the last edge past 1 so a confidence of exactly 1.0 still falls
    # in the final half-open bin [lower, upper).
    bins[-1] = 1.0001
    width = (bins[1] - bins[0]).item()
    bin_indices = [
        confidences.ge(bin_lower) & confidences.lt(bin_upper)
        for bin_lower, bin_upper in zip(bins[:-1], bins[1:])
    ]
    # Empty bins would give NaN means; plot them as zero-height bars instead.
    bin_corrects = np.array([
        accuracies[bin_index].mean().item() if bin_index.any() else 0.0
        for bin_index in bin_indices
    ])
    bin_scores = np.array([
        confidences[bin_index].mean().item() if bin_index.any() else 0.0
        for bin_index in bin_indices
    ])

    # x positions are the left bin edges, so bars must be edge-aligned
    # (matplotlib centres bars on x by default).
    left_edges = bins[:-1].numpy()
    confs = rel_ax.bar(left_edges, bin_corrects, width=width, align='edge')
    gaps = rel_ax.bar(
        left_edges,
        bin_scores - bin_corrects,
        bottom=bin_corrects,
        color=[1, 0.7, 0.7],
        alpha=0.5,
        width=width,
        hatch='//',
        edgecolor='r',
        align='edge',
    )
    # Diagonal = perfect calibration.
    rel_ax.plot([0, 1], [0, 1], '--', color='gray')
    rel_ax.legend([confs, gaps], ['Outputs', 'Gap'], loc='best', fontsize='small')

    # Clean up
    rel_ax.set_ylabel('Accuracy')
    rel_ax.set_xlabel('Confidence')
    f.tight_layout()
    return f
LianzheWang commented Sep 17, 2021

Hi guys, I just tried this code (including the modification from @rsilveira79).
Everything went pretty okay. However, I find that I'm not able to plot the gap between confidence and accuracy.

For me, the problem comes from `bin_corrects`, which cannot be used directly as the `bottom` parameter of `plt.bar`.

I made the following modification, which solves the problem for me. You may also try this if you are facing the same issue.
(with minor style changes)

def make_model_diagrams(outputs, labels, n_bins=10):
    """
    Draw a reliability diagram annotated with the ECE, and return the ECE.

    outputs - a torch tensor (size n x num_classes) with the outputs from the
        final linear layer - NOT the softmaxes
    labels - a torch tensor (size n) with the labels
    n_bins - number of equal-width confidence bins over [0, 1]

    The plot is drawn on pyplot figure number 0; nothing is shown or saved
    here. Returns the value computed by `_calculate_ece` as a float.
    """
    softmaxes = torch.nn.functional.softmax(outputs, 1)
    confidences, predictions = softmaxes.max(1)
    # Cast to float: torch.mean is not defined for bool/integer tensors.
    accuracies = torch.eq(predictions, labels).float().detach().cpu()
    confidences = confidences.float().detach().cpu()

    # Reliability diagram
    bins = torch.linspace(0, 1, n_bins + 1)
    width = 1.0 / n_bins
    bin_centers = np.linspace(0, 1.0 - width, n_bins) + width / 2
    # Bins are (lower, upper], the same convention _calculate_ece uses, so
    # the plotted gaps correspond to the reported ECE and a confidence of
    # exactly 1.0 lands in the last bin.
    bin_indices = [
        confidences.gt(bin_lower.item()) & confidences.le(bin_upper.item())
        for bin_lower, bin_upper in zip(bins[:-1], bins[1:])
    ]
    # Empty bins have no mean; plot them as zero-height bars.
    bin_corrects = np.array([
        accuracies[bin_index].mean().item() if bin_index.any() else 0.0
        for bin_index in bin_indices
    ])
    bin_scores = np.array([
        confidences[bin_index].mean().item() if bin_index.any() else 0.0
        for bin_index in bin_indices
    ])
    gap = bin_scores - bin_corrects

    plt.figure(0, figsize=(8, 8))
    confs = plt.bar(bin_centers, bin_corrects, color=[0, 0, 1], width=width, ec='black')
    # Stack the gap on top of the accuracy bar; bottom must be a plain
    # numeric array, not a list of tensors.
    gaps = plt.bar(
        bin_centers,
        gap,
        bottom=bin_corrects,
        color=[1, 0.7, 0.7],
        alpha=0.5,
        width=width,
        hatch='//',
        edgecolor='r',
    )
    # Diagonal = perfect calibration.
    plt.plot([0, 1], [0, 1], '--', color='gray')
    plt.legend([confs, gaps], ['Accuracy', 'Gap'], loc='upper left', fontsize='x-large')

    # Use the same number of bins for the metric as for the plot.
    ece = _calculate_ece(outputs, labels, n_bins=n_bins)

    # Clean up
    bbox_props = dict(boxstyle="square", fc="lightgrey", ec="gray", lw=1.5)
    plt.text(0.17, 0.82, "ECE: {:.4f}".format(ece), ha="center", va="center", size=20, weight='normal', bbox=bbox_props)

    plt.title("Reliability Diagram", size=22)
    plt.ylabel("Accuracy", size=18)
    plt.xlabel("Confidence", size=18)
    return ece

Using the unchanged ECE calculation method provided by @rsilveira79.

def _calculate_ece(logits, labels, n_bins=10):
    Calculates the Expected Calibration Error of a model.
    (This isn't necessary for temperature scaling, just a cool metric).
    The input to this loss is the logits of a model, NOT the softmax scores.
    This divides the confidence outputs into equally-sized interval bins.
    In each bin, we compute the confidence gap:
    bin_gap = | avg_confidence_in_bin - accuracy_in_bin |
    We then return a weighted average of the gaps, based on the number
    of samples in each bin
    See: Naeini, Mahdi Pakdaman, Gregory F. Cooper, and Milos Hauskrecht.
    "Obtaining Well Calibrated Probabilities Using Bayesian Binning." AAAI.

    bin_boundaries = torch.linspace(0, 1, n_bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]

    softmaxes = F.softmax(logits, dim=1)
    confidences, predictions = torch.max(softmaxes, 1)
    accuracies = predictions.eq(labels)

    ece = torch.zeros(1, device=logits.device)
    for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
        # Calculated |confidence - accuracy| in each bin
        in_bin = * confidences.le(bin_upper.item())
        prop_in_bin = in_bin.float().mean()
        if prop_in_bin.item() > 0:
            accuracy_in_bin = accuracies[in_bin].float().mean()
            avg_confidence_in_bin = confidences[in_bin].mean()
            ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
    return ece.item()

