@gpleiss
Created May 23, 2018 23:42
Reliability diagram code
import torch
import numpy as np
from matplotlib import pyplot as plt

def make_model_diagrams(outputs, labels, n_bins=10):
    """
    outputs - a torch tensor (size n x num_classes) with the outputs from the final linear layer
    - NOT the softmaxes
    labels - a torch tensor (size n) with the labels
    """
    softmaxes = torch.nn.functional.softmax(outputs, 1)
    confidences, predictions = softmaxes.max(1)
    accuracies = torch.eq(predictions, labels)
    f, rel_ax = plt.subplots(1, 2, figsize=(4, 2.5))

    # Reliability diagram
    bins = torch.linspace(0, 1, n_bins + 1)
    bins[-1] = 1.0001
    width = bins[1] - bins[0]
    bin_indices = [confidences.ge(bin_lower) * confidences.lt(bin_upper) for bin_lower, bin_upper in zip(bins[:-1], bins[1:])]
    bin_corrects = [torch.mean(accuracies[bin_index]) for bin_index in bin_indices]
    bin_scores = [torch.mean(confidences[bin_index]) for bin_index in bin_scores]
    confs = rel_ax.bar(bins[:-1], bin_corrects.numpy(), width=width)
    gaps = rel_ax.bar(bins[:-1], (bin_scores - bin_corrects).numpy(), bottom=bin_corrects.numpy(), color=[1, 0.7, 0.7], alpha=0.5, width=width, hatch='//', edgecolor='r')
    rel_ax.plot([0, 1], [0, 1], '--', color='gray')
    rel_ax.legend([confs, gaps], ['Outputs', 'Gap'], loc='best', fontsize='small')

    # Clean up
    rel_ax.set_ylabel('Accuracy')
    rel_ax.set_xlabel('Confidence')
    f.tight_layout()
    return f
@kirk86

kirk86 commented Nov 12, 2018

Is line 22 correct? bin_scores is never defined before that line. I believe that should be bin_indices? But then bin_scores becomes a list of tensors which causes line 25 to throw an error.
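For what it's worth, a minimal sketch of how those two per-bin lines could be written so the rest of the function has a chance of running (just one possible fix, assuming the intent is the per-bin mean accuracy and mean confidence):

bin_corrects = torch.stack([torch.mean(accuracies[bin_index].float()) for bin_index in bin_indices])  # cast the Bool mask result to float before taking the mean
bin_scores = torch.stack([torch.mean(confidences[bin_index]) for bin_index in bin_indices])           # iterate over bin_indices, not bin_scores
# torch.stack turns the per-bin values into a single tensor, so .numpy() and the subtraction later work; empty bins come out as NaN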

@rsilveira79

rsilveira79 commented Aug 6, 2020

Hi guys, I made some slight modifications to the plotting code and added the ECE calculation.

ECE calculation - adapted from the _ECELoss class

import torch
import torch.nn.functional as F

def calculate_ece(logits, labels, n_bins=10):
    """
    Calculates the Expected Calibration Error of a model.
    (This isn't necessary for temperature scaling, just a cool metric).
    The input to this loss is the logits of a model, NOT the softmax scores.
    This divides the confidence outputs into equally-sized interval bins.
    In each bin, we compute the confidence gap:
    bin_gap = | avg_confidence_in_bin - accuracy_in_bin |
    We then return a weighted average of the gaps, based on the number
    of samples in each bin
    See: Naeini, Mahdi Pakdaman, Gregory F. Cooper, and Milos Hauskrecht.
    "Obtaining Well Calibrated Probabilities Using Bayesian Binning." AAAI.
    2015.
    """

    bin_boundaries = torch.linspace(0, 1, n_bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]

    softmaxes = F.softmax(logits, dim=1)
    confidences, predictions = torch.max(softmaxes, 1)
    accuracies = predictions.eq(labels)

    ece = torch.zeros(1, device=logits.device)
    for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
        # Calculate |confidence - accuracy| in each bin
        in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
        prop_in_bin = in_bin.float().mean()
        if prop_in_bin.item() > 0:
            accuracy_in_bin = accuracies[in_bin].float().mean()
            avg_confidence_in_bin = confidences[in_bin].mean()
            ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
    return ece.item()
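A quick sanity check of calculate_ece on made-up tensors (purely illustrative; with a real model you would pass its logits and the true labels):

import torch
import torch.nn.functional as F

logits = torch.randn(1000, 10)           # fake logits: 1000 samples, 10 classes
labels = torch.randint(0, 10, (1000,))   # fake labels
print(calculate_ece(logits, labels, n_bins=10))  # random logits are poorly calibrated: average confidence sits well above the ~10% accuracy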

Reliability Diagram

def make_model_diagrams(outputs, labels, n_bins=10):
    """
    outputs - a torch tensor (size n x num_classes) with the outputs from the final linear layer
    - NOT the softmaxes
    labels - a torch tensor (size n) with the labels
    """
    softmaxes = torch.nn.functional.softmax(outputs, 1)
    confidences, predictions = softmaxes.max(1)
    accuracies = torch.eq(predictions, labels)
    overall_accuracy = (predictions==labels).sum().item()/len(labels)
    
    # Reliability diagram
    bins = torch.linspace(0, 1, n_bins + 1)
    width = 1.0 / n_bins
    bin_centers = np.linspace(0, 1.0 - width, n_bins) + width / 2
    bin_indices = [confidences.ge(bin_lower) * confidences.lt(bin_upper) for bin_lower, bin_upper in zip(bins[:-1], bins[1:])]
    
    bin_corrects = np.array([ torch.mean(accuracies[bin_index].float()) for bin_index in bin_indices])
    bin_scores = np.array([ torch.mean(confidences[bin_index].float()) for bin_index in bin_indices])
     
    plt.figure(0, figsize=(8, 8))
    gap = (bin_scores - bin_corrects)
    confs = plt.bar(bin_centers, bin_corrects, width=width, alpha=0.1, ec='black')
    gaps = plt.bar(bin_centers, (bin_scores - bin_corrects), bottom=bin_corrects, color=[1, 0.7, 0.7], alpha=0.5, width=width, hatch='//', edgecolor='r')
    plt.plot([0, 1], [0, 1], '--', color='gray')
    plt.legend([confs, gaps], ['Outputs', 'Gap'], loc='best', fontsize='small')

    ece = calculate_ece(outputs, labels)

    # Clean up
    bbox_props = dict(boxstyle="round", fc="lightgrey", ec="brown", lw=2)
    plt.text(0.2, 0.85, "ECE: {:.2f}".format(ece), ha="center", va="center", size=20, weight = 'bold', bbox=bbox_props)

    plt.title("Reliability Diagram", size=20)
    plt.ylabel("Accuracy (P[y]",  size=18)
    plt.xlabel("Confidence",  size=18)
    plt.xlim(0,1)
    plt.ylim(0,1)
    return ece
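
For a quick end-to-end check, something like this (random tensors, purely illustrative) should draw the diagram and return the ECE:

import torch

outputs = torch.randn(500, 10)          # fake logits: 500 samples, 10 classes
labels = torch.randint(0, 10, (500,))   # fake labels
ece = make_model_diagrams(outputs, labels, n_bins=10)
print("ECE = {:.4f}".format(ece))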

@gpleiss
Author

gpleiss commented Aug 14, 2020

@rsilveira79 if you want to make this a PR, please do!

@Mahhos

Mahhos commented May 1, 2021

The result of accuracies = torch.eq(predictions, labels) is a boolean tensor. How are we calculating the mean of a boolean tensor in the line bin_corrects = np.array([ torch.mean(accuracies[bin_index].float()) for bin_index in bin_indices])?
I am getting this specific error: RuntimeError: Can only calculate the mean of floating types. Got Bool instead.
Do you have any idea how I should resolve that? Thank you.

@gpleiss
Author

gpleiss commented May 3, 2021

accuracies[bin_index].float() should create a floating-point tensor, so I'm confused as to why this line would cause the error.
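
For anyone hitting the same RuntimeError, a tiny repro of the Bool/mean behaviour (the cast to float has to happen before torch.mean):

import torch

accuracies = torch.tensor([True, False, True])
# torch.mean(accuracies)               # raises: Can only calculate the mean of floating types. Got Bool instead.
print(torch.mean(accuracies.float()))  # tensor(0.6667)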

@LianzheWang

LianzheWang commented Sep 17, 2021

Hi guys, I just tried this code (including the modifications from @rsilveira79).
Everything went pretty well, except that I was not able to plot the gap between confidence and accuracy.

In my case, the problem was that bin_corrects cannot be used directly as the bottom parameter of plt.bar().

I made the following modification (along with minor style changes), which solves the problem for me. You may want to try it if you are facing the same issue.

def make_model_diagrams(outputs, labels, n_bins=10):
    """
    outputs - a torch tensor (size n x num_classes) with the outputs from the final linear layer
    - NOT the softmaxes
    labels - a torch tensor (size n) with the labels
    """
    softmaxes = torch.nn.functional.softmax(outputs, 1)
    confidences, predictions = softmaxes.max(1)
    accuracies = torch.eq(predictions, labels)
    overall_accuracy = (predictions==labels).sum().item()/len(labels)
    
    # Reliability diagram
    bins = torch.linspace(0, 1, n_bins + 1)
    width = 1.0 / n_bins
    bin_centers = np.linspace(0, 1.0 - width, n_bins) + width / 2
    bin_indices = [confidences.ge(bin_lower) * confidences.lt(bin_upper) for bin_lower, bin_upper in zip(bins[:-1], bins[1:])]
    
    bin_corrects = np.array([ torch.mean(accuracies[bin_index].float()) for bin_index in bin_indices])
    bin_scores = np.array([ torch.mean(confidences[bin_index].float()) for bin_index in bin_indices])
    bin_corrects = np.nan_to_num(bin_corrects)
    bin_scores = np.nan_to_num(bin_scores)
    
    plt.figure(0, figsize=(8, 8))
    gap = np.array(bin_scores - bin_corrects)
    
    confs = plt.bar(bin_centers, bin_corrects, color=[0, 0, 1], width=width, ec='black')
    bin_corrects = np.nan_to_num(np.array([bin_correct.cpu().numpy()  for bin_correct in bin_corrects]))
    gaps = plt.bar(bin_centers, gap, bottom=bin_corrects, color=[1, 0.7, 0.7], alpha=0.5, width=width, hatch='//', edgecolor='r')
    
    plt.plot([0, 1], [0, 1], '--', color='gray')
    plt.legend([confs, gaps], ['Accuracy', 'Gap'], loc='upper left', fontsize='x-large')

    ece = _calculate_ece(outputs, labels)

    # Clean up
    bbox_props = dict(boxstyle="square", fc="lightgrey", ec="gray", lw=1.5)
    plt.text(0.17, 0.82, "ECE: {:.4f}".format(ece), ha="center", va="center", size=20, weight = 'normal', bbox=bbox_props)

    plt.title("Reliability Diagram", size=22)
    plt.ylabel("Accuracy",  size=18)
    plt.xlabel("Confidence",  size=18)
    plt.xlim(0,1)
    plt.ylim(0,1)
    plt.savefig('reliability_diagram.png')
    plt.show()
    return ece

It uses the ECE calculation method provided by @rsilveira79 unchanged, just renamed to _calculate_ece.

