import torch
import numpy as np
from matplotlib import pyplot as plt

def make_model_diagrams(outputs, labels, n_bins=10):
    """
    outputs - a torch tensor (size n x num_classes) with the outputs from the final linear layer
              - NOT the softmaxes
    labels - a torch tensor (size n) with the labels
    """
    softmaxes = torch.nn.functional.softmax(outputs, 1)
    confidences, predictions = softmaxes.max(1)
    accuracies = torch.eq(predictions, labels)
    f, rel_ax = plt.subplots(1, 2, figsize=(4, 2.5))

    # Reliability diagram
    bins = torch.linspace(0, 1, n_bins + 1)
    bins[-1] = 1.0001
    width = bins[1] - bins[0]
    bin_indices = [confidences.ge(bin_lower) * confidences.lt(bin_upper) for bin_lower, bin_upper in zip(bins[:-1], bins[1:])]
    bin_corrects = [torch.mean(accuracies[bin_index]) for bin_index in bin_indices]
    bin_scores = [torch.mean(confidences[bin_index]) for bin_index in bin_scores]

    confs = rel_ax.bar(bins[:-1], bin_corrects.numpy(), width=width)
    gaps = rel_ax.bar(bins[:-1], (bin_scores - bin_corrects).numpy(), bottom=bin_corrects.numpy(), color=[1, 0.7, 0.7], alpha=0.5, width=width, hatch='//', edgecolor='r')
    rel_ax.plot([0, 1], [0, 1], '--', color='gray')
    rel_ax.legend([confs, gaps], ['Outputs', 'Gap'], loc='best', fontsize='small')

    # Clean up
    rel_ax.set_ylabel('Accuracy')
    rel_ax.set_xlabel('Confidence')
    f.tight_layout()
    return f
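As an aside, the binning step above works by building one boolean mask per confidence interval. A minimal sketch of that logic in isolation (the confidence and accuracy values here are made up purely for illustration):

import torch

# Hypothetical per-sample confidences and correctness flags (made-up values)
confidences = torch.tensor([0.15, 0.35, 0.55, 0.62, 0.90, 0.97])
accuracies = torch.tensor([False, False, True, True, True, True])

bins = torch.linspace(0, 1, 10 + 1)
# One boolean mask per bin: True where the confidence falls in [lower, upper)
bin_indices = [confidences.ge(lo) & confidences.lt(hi) for lo, hi in zip(bins[:-1], bins[1:])]

# Per-bin accuracy; note the .float() cast, since mean() is not defined for bool tensors,
# and that empty bins yield NaN
bin_corrects = [accuracies[mask].float().mean() for mask in bin_indices]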
Hi guys, I made some slight modifications to the plotting code and added the ECE calculation.
ECE Calculation (adapted from the _ECELoss class)
import torch
import torch.nn.functional as F

def calculate_ece(logits, labels, n_bins=10):
    """
    Calculates the Expected Calibration Error of a model.
    (This isn't necessary for temperature scaling, just a cool metric.)

    The input to this loss is the logits of a model, NOT the softmax scores.

    This divides the confidence outputs into equally-sized interval bins.
    In each bin, we compute the confidence gap:

        bin_gap = | avg_confidence_in_bin - accuracy_in_bin |

    We then return a weighted average of the gaps, based on the number
    of samples in each bin.

    See: Naeini, Mahdi Pakdaman, Gregory F. Cooper, and Milos Hauskrecht.
    "Obtaining Well Calibrated Probabilities Using Bayesian Binning." AAAI 2015.
    """
    bin_boundaries = torch.linspace(0, 1, n_bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]

    softmaxes = F.softmax(logits, dim=1)
    confidences, predictions = torch.max(softmaxes, 1)
    accuracies = predictions.eq(labels)

    ece = torch.zeros(1, device=logits.device)
    for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
        # Compute |confidence - accuracy| in each bin, weighted by the bin's share of samples
        in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
        prop_in_bin = in_bin.float().mean()
        if prop_in_bin.item() > 0:
            accuracy_in_bin = accuracies[in_bin].float().mean()
            avg_confidence_in_bin = confidences[in_bin].mean()
            ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
    return ece.item()
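A minimal usage sketch (the logits and labels below are random placeholders, purely to show the call):

logits = torch.randn(1000, 5)            # raw outputs of the final linear layer
labels = torch.randint(0, 5, (1000,))    # ground-truth class indices

ece = calculate_ece(logits, labels, n_bins=10)
print("ECE: {:.4f}".format(ece))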
Reliability Diagram
def make_model_diagrams(outputs, labels, n_bins=10):
    """
    outputs - a torch tensor (size n x num_classes) with the outputs from the final linear layer
              - NOT the softmaxes
    labels - a torch tensor (size n) with the labels
    """
    softmaxes = torch.nn.functional.softmax(outputs, 1)
    confidences, predictions = softmaxes.max(1)
    accuracies = torch.eq(predictions, labels)
    overall_accuracy = (predictions == labels).sum().item() / len(labels)

    # Reliability diagram
    bins = torch.linspace(0, 1, n_bins + 1)
    width = 1.0 / n_bins
    bin_centers = np.linspace(0, 1.0 - width, n_bins) + width / 2
    bin_indices = [confidences.ge(bin_lower) * confidences.lt(bin_upper) for bin_lower, bin_upper in zip(bins[:-1], bins[1:])]

    bin_corrects = np.array([torch.mean(accuracies[bin_index].float()) for bin_index in bin_indices])
    bin_scores = np.array([torch.mean(confidences[bin_index].float()) for bin_index in bin_indices])

    plt.figure(0, figsize=(8, 8))
    gap = bin_scores - bin_corrects
    confs = plt.bar(bin_centers, bin_corrects, width=width, alpha=0.1, ec='black')
    gaps = plt.bar(bin_centers, gap, bottom=bin_corrects, color=[1, 0.7, 0.7], alpha=0.5, width=width, hatch='//', edgecolor='r')
    plt.plot([0, 1], [0, 1], '--', color='gray')
    plt.legend([confs, gaps], ['Outputs', 'Gap'], loc='best', fontsize='small')

    ece = calculate_ece(outputs, labels)

    # Clean up
    bbox_props = dict(boxstyle="round", fc="lightgrey", ec="brown", lw=2)
    plt.text(0.2, 0.85, "ECE: {:.2f}".format(ece), ha="center", va="center", size=20, weight='bold', bbox=bbox_props)
    plt.title("Reliability Diagram", size=20)
    plt.ylabel("Accuracy (P[y])", size=18)
    plt.xlabel("Confidence", size=18)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return ece
@rsilveira79 if you want to make this a PR, please do!
The result of accuracies = torch.eq(predictions, labels) is a boolean tensor. How, then, does the line bin_corrects = np.array([torch.mean(accuracies[bin_index].float()) for bin_index in bin_indices]) calculate the mean of a boolean tensor?
I am getting this specific error: RuntimeError: Can only calculate the mean of floating types. Got Bool instead.
Do you have any idea how I should resolve that? Thank you.
accuracies[bin_index].float() should create a floating-point tensor, so I'm confused as to why this line would cause the error.
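As an aside, the error can be reproduced in isolation, and the .float() cast is indeed the fix (a tiny check with made-up values). Note that the original gist's bin_corrects line does not include the cast, which is likely where the error actually comes from:

import torch

accuracies = torch.tensor([True, False, True])
# accuracies.mean()        # RuntimeError: Can only calculate the mean of floating types. Got Bool instead.
accuracies.float().mean()  # tensor(0.6667)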
Hi guys, I just tried this code (including the modifications from @rsilveira79). Everything went pretty well, except that I wasn't able to plot the gap between confidence and accuracy. In my case, the problem was that bin_corrects could not be used directly as the bottom parameter of plt.bar(). The modification below solves the problem for me; you may want to try it if you are facing the same issue (it also includes minor style changes).
def make_model_diagrams(outputs, labels, n_bins=10):
    """
    outputs - a torch tensor (size n x num_classes) with the outputs from the final linear layer
              - NOT the softmaxes
    labels - a torch tensor (size n) with the labels
    """
    softmaxes = torch.nn.functional.softmax(outputs, 1)
    confidences, predictions = softmaxes.max(1)
    accuracies = torch.eq(predictions, labels)
    overall_accuracy = (predictions == labels).sum().item() / len(labels)

    # Reliability diagram
    bins = torch.linspace(0, 1, n_bins + 1)
    width = 1.0 / n_bins
    bin_centers = np.linspace(0, 1.0 - width, n_bins) + width / 2
    bin_indices = [confidences.ge(bin_lower) * confidences.lt(bin_upper) for bin_lower, bin_upper in zip(bins[:-1], bins[1:])]

    bin_corrects = np.array([torch.mean(accuracies[bin_index].float()) for bin_index in bin_indices])
    bin_scores = np.array([torch.mean(confidences[bin_index].float()) for bin_index in bin_indices])
    # Empty bins produce NaNs; replace them with zeros so the bars plot correctly
    bin_corrects = np.nan_to_num(bin_corrects)
    bin_scores = np.nan_to_num(bin_scores)

    plt.figure(0, figsize=(8, 8))
    gap = np.array(bin_scores - bin_corrects)
    confs = plt.bar(bin_centers, bin_corrects, color=[0, 0, 1], width=width, ec='black')
    # Convert bin_corrects to a plain float array before passing it as `bottom`
    bin_corrects = np.nan_to_num(np.array([bin_correct.cpu().numpy() for bin_correct in bin_corrects]))
    gaps = plt.bar(bin_centers, gap, bottom=bin_corrects, color=[1, 0.7, 0.7], alpha=0.5, width=width, hatch='//', edgecolor='r')

    plt.plot([0, 1], [0, 1], '--', color='gray')
    plt.legend([confs, gaps], ['Accuracy', 'Gap'], loc='upper left', fontsize='x-large')

    ece = _calculate_ece(outputs, labels)

    # Clean up
    bbox_props = dict(boxstyle="square", fc="lightgrey", ec="gray", lw=1.5)
    plt.text(0.17, 0.82, "ECE: {:.4f}".format(ece), ha="center", va="center", size=20, weight='normal', bbox=bbox_props)
    plt.title("Reliability Diagram", size=22)
    plt.ylabel("Accuracy", size=18)
    plt.xlabel("Confidence", size=18)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.savefig('reliability_diagram.png')
    plt.show()
    return ece
Using the unchanged ECE calculation method provided by @rsilveira79.
def _calculate_ece(logits, labels, n_bins=10):
    """
    Calculates the Expected Calibration Error of a model.
    (This isn't necessary for temperature scaling, just a cool metric.)

    The input to this loss is the logits of a model, NOT the softmax scores.

    This divides the confidence outputs into equally-sized interval bins.
    In each bin, we compute the confidence gap:

        bin_gap = | avg_confidence_in_bin - accuracy_in_bin |

    We then return a weighted average of the gaps, based on the number
    of samples in each bin.

    See: Naeini, Mahdi Pakdaman, Gregory F. Cooper, and Milos Hauskrecht.
    "Obtaining Well Calibrated Probabilities Using Bayesian Binning." AAAI 2015.
    """
    bin_boundaries = torch.linspace(0, 1, n_bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]

    softmaxes = F.softmax(logits, dim=1)
    confidences, predictions = torch.max(softmaxes, 1)
    accuracies = predictions.eq(labels)

    ece = torch.zeros(1, device=logits.device)
    for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
        # Compute |confidence - accuracy| in each bin, weighted by the bin's share of samples
        in_bin = confidences.gt(bin_lower.item()) * confidences.le(bin_upper.item())
        prop_in_bin = in_bin.float().mean()
        if prop_in_bin.item() > 0:
            accuracy_in_bin = accuracies[in_bin].float().mean()
            avg_confidence_in_bin = confidences[in_bin].mean()
            ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
    return ece.item()
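A minimal usage sketch for the two functions above (assuming torch, numpy as np, matplotlib.pyplot as plt, and torch.nn.functional as F are imported; the data below are random placeholders purely for illustration):

outputs = torch.randn(500, 10)           # raw logits from the final linear layer
labels = torch.randint(0, 10, (500,))    # ground-truth class indices

ece = make_model_diagrams(outputs, labels, n_bins=10)
print("Expected Calibration Error: {:.4f}".format(ece))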
Is line 22 correct? bin_scores is never defined before that line; I believe it should be bin_indices. But then bin_scores becomes a list of tensors, which causes line 25 to throw an error.
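For what it's worth, the later comments effectively address this: the bin_scores line should iterate over bin_indices, and both lines need a .float() cast and an np.array wrapper so the boolean/confidence tensors average cleanly and can be plotted. A sketch of the corrected pair, mirroring @rsilveira79's version:

bin_corrects = np.array([torch.mean(accuracies[bin_index].float()) for bin_index in bin_indices])
bin_scores = np.array([torch.mean(confidences[bin_index].float()) for bin_index in bin_indices])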