\documentclass{article}
% if you need to pass options to natbib, use, e.g.:
% \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2019
% ready for submission
% \usepackage{neurips_2019}
% to compile a preprint version, e.g., for submission to arXiv, add the
% [preprint] option:
\usepackage[final]{neurips_2019}
% to compile a camera-ready version, add the [final] option, e.g.:
% \usepackage{neurips_2019}
% to avoid loading the natbib package, add option nonatbib:
% \usepackage[nonatbib]{neurips_2019}
\usepackage[compact]{titlesec}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc} % use 8-bit T1 fonts
\usepackage[breaklinks=true,colorlinks,citecolor=black,bookmarks=false]{hyperref}
\hypersetup{
pdfinfo={
Title={Using Self-Supervised Learning Can Improve Model Robustness and Uncertainty},
Author={Dan Hendrycks and Mantas Mazeika and Saurav Kadavath and Dawn Song},
Subject={Self-Supervised, Adversarial Examples, Out of distribution, Robustness, Uncertainty},
}
}
\usepackage{url} % simple URL typesetting
\usepackage{booktabs} % professional-quality tables
\usepackage{amsfonts} % blackboard math symbols
\usepackage{nicefrac} % compact symbols for 1/2, etc.
\usepackage{microtype} % microtypography
% added packages
\usepackage{subcaption}
\usepackage{textcomp}
\usepackage{gensymb}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{wrapfig}
\usepackage{lipsum}
\usepackage{diagbox}
\usepackage{stfloats}
\usepackage{multirow}
\usepackage{tabularx}
\newcolumntype{Y}{>{\centering\arraybackslash}X}
\usepackage{appendix}
\usepackage{cleveref}
\crefname{appsec}{Appendix}{Appendices}
\usepackage{arydshln}
\usepackage{makecell}
\usepackage{dsfont}
\usepackage{color, colortbl}
\definecolor{neonpurple}{rgb}{0.3,0,1}
\newcommand{\dan}[1]{{\textcolor{neonpurple}{[Dan: #1]}}}
\makeatletter
\newcommand{\printfnsymbol}[1]{%
\textsuperscript{\@fnsymbol{#1}}%
}
\makeatother
\title{Using Self-Supervised Learning Can Improve Model Robustness and Uncertainty}
% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to break the
% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4
% authors names on the first line, and the last on the second line, try using
% \AND instead of \And before the third author name.
\author{%
Dan Hendrycks \\ UC Berkeley \\
\texttt{hendrycks@berkeley.edu} \\
\And
Mantas Mazeika\thanks{Equal Contribution.} \\ UIUC \\
\texttt{mantas3@illinois.edu} \\
\And
Saurav Kadavath\printfnsymbol{1} \\ UC Berkeley \\
\texttt{sauravkadavath@berkeley.edu} \\
\And
Dawn Song \\ UC Berkeley \\
\texttt{dawnsong@berkeley.edu} \\
}
\begin{document}
\maketitle
\begin{abstract}
Self-supervision provides effective representations for downstream tasks without requiring labels. However, existing approaches lag behind fully supervised training and are often not thought beneficial beyond obviating the need for annotations. We find that self-supervision can benefit robustness in a variety of ways, including robustness to adversarial examples, label corruption, and common input corruptions. Additionally, self-supervision greatly benefits out-of-distribution detection on difficult, near-distribution outliers, so much so that it exceeds the performance of fully supervised methods. These results demonstrate the promise of self-supervision for improving robustness and uncertainty estimation and establish these tasks as new axes of evaluation for future self-supervised learning research.
\end{abstract}
\section{Introduction}
% Most self-supervised research is seen as catching up to supervised performance. Training with self-supervision in conjunction with supervised losses does not substantially impact classification accuracy. However, we find that it does improve robustness and uncertainty. No additional assumptions are required to improve performance. Just a different loss.
% Conjunction with instead of catching up.
% Actually can help in various unappreciated ways but (again) masked by looking solely at clean accuracy.
% Surpass supervised performance in OOD detection.
% Robustness and uncertainty evaluations provide a novel axis of improvement for self-supervised methods.
% Subclass of unsupervised learning. Set up tasks where you predict something about the input.
% Strong representations compared to standard unsupervised density estimation and reconstruction methods. Useful in robotics and video as well as non-vision tasks.
% Prior work on unsupervised learning is motivated by prospect of training on large, unlabeled datasets, and tuning on potentially smaller labeled datasets. Typical evaluations train a linear layer or small fully-connected layer on top of the self-supervised representations, and performance is lower than fully-supervised methods trained on the target task in the case of ImageNet classification (emphasis man).
% They have only been evaluated on typical accuracy-based metrics, and self-supervision does not improve accuracy very much when used in conjunction with fully-supervised learning, as the supervision provides a stronger signal for separating the data.
% We show that even though standard accuracy is not affected by self-supervision, self-supervision can improve various facets of model robustness. Moreover, we find that self-supervision greatly improves out-of-distribution detection for difficult, near-distribution examples, so much so that it outperforms supervised methods.
% Thus, we find that self-supervised learning has value beyond enabling unsupervised representation learning by not requiring labels. In particular, it enforces priors on models that improve robustness and out-of-distribution detection. List tasks. We improve performance on all of them. Importantly, these gains are masked if one looks at clean accuracy alone, for which performance stays constant.
% These results suggest robustness and uncertainty tasks as additional axes of evaluation for self-supervised and unsupervised representation learning. Importantly, these methods can improve robustness and uncertainty estimation without requiring larger models or additional data. They can be added to existing methods for additive effect with no additional assumptions.
% We identify effective approaches to improve out-of-distribution detection and model robustness.
% robotics
% \cite{grasp2vec, time_contrastive_nets, robustness_retrying}
% video
% \cite{Owens2018AudioVisualSA, Wiles18a, Vondrick_2018_ECCV}
% we want to emphasize that rotnet is particularly strong
Self-supervised learning holds great promise for improving representations when labeled data are scarce. In semi-supervised learning, recent self-supervision methods are state-of-the-art \citep{rotnet, exemplar_nets, S4L}, and self-supervision is essential in video tasks where annotation is costly \citep{Vondrick_2018_ECCV, Vondrick_2016}. To date, however, self-supervised approaches lag behind fully supervised training on standard accuracy metrics, and research has largely focused on catching up to supervised performance. Additionally, when used in conjunction with fully supervised learning on a fully labeled dataset, self-supervision has little impact on accuracy. This raises the question of whether large labeled datasets render self-supervision needless.
We show that while self-supervision does not substantially improve accuracy when used in tandem with standard training on fully labeled datasets, it can improve several aspects of model robustness, including robustness to adversarial examples \citep{madry}, label corruptions \citep{Patrini, noise_label_overfitting}, and common input corruptions such as fog, snow, and blur \citep{hendrycks2019robustness}. Importantly, these gains are masked if one looks at clean accuracy alone, for which performance stays constant. Moreover, we find that self-supervision greatly improves out-of-distribution detection for difficult, near-distribution examples, a long-standing and underexplored problem. In fact, using self-supervised learning techniques on CIFAR-10 and ImageNet, we are even able to \emph{surpass fully supervised methods}.
These results demonstrate that self-supervision need not be viewed as a collection of techniques allowing models to catch up to full supervision. Rather, using the two in conjunction provides strong regularization that improves robustness and uncertainty estimation even if clean accuracy does not change. Importantly, these methods can improve robustness and uncertainty estimation without requiring larger models or additional data \citep{madrydata, kurakin}. They can be used with task-specific methods for additive effect with no additional assumptions. With self-supervised learning, we make tangible progress on adversarial robustness, label corruption, common input corruptions, and out-of-distribution detection, suggesting that future self-supervised learning methods could also be judged by their utility for uncertainty estimates and model robustness. Code and our expanded ImageNet validation dataset are available at \href{https://github.com/hendrycks/ss-ood}{\texttt{https://github.com/hendrycks/ss-ood}}.
\section{Related Work}
\begin{wrapfigure}{r}{0.5\textwidth}
\vspace{-10pt}
\includegraphics[width=0.95\linewidth]{figures/zebra-texture-flip.jpg}
\caption{Predicting rotation requires modeling shape. Texture alone is not sufficient for determining whether the zebra is flipped, although it may be sufficient for classification under ideal conditions. Thus, training with self-supervised auxiliary rotations may improve robustness.}
\label{fig:zebrafig}
\vspace{-5pt}
\end{wrapfigure}
\textbf{Self-supervised learning.}\quad
A number of self-supervised methods have been proposed, each exploring a different pretext task. \citet{relative_position} predict the relative position of image patches and use the resulting representation to improve object detection. \citet{exemplar_nets} create surrogate classes to train on by transforming seed image patches. Similarly, \citet{rotnet} predict image rotations. Other approaches include using colorization as a proxy task \citep{gustav_colorization}, deep clustering methods \citep{IID}, and methods that maximize mutual information \citep{deepinfomax} with high-level representations \citep{cpc, hnaff2019dataefficient}. These works focus on the utility of self-supervision for learning without labeled data and do not consider its effect on robustness and uncertainty estimation.
\textbf{Robustness.}\quad
Improving model robustness refers to the goal of ensuring machine learning models are resistant across a variety of imperfect training and testing conditions. \citet{hendrycks2019robustness} look at how models can handle common real-world image corruptions (such as fog, blur, and JPEG compression) and propose a comprehensive set of distortions to evaluate real-world robustness. Another robustness problem is learning in the presence of corrupted labels \citep{Nettleton, Patrini}. To this end, \citet{hendrycks2018glc} introduce Gold Loss Correction (GLC), a method that uses a small set of trusted labels to improve accuracy in this setting. With high degrees of label corruption, models start to overfit the misinformation in the corrupted labels \citep{noise_label_overfitting}, suggesting a need for ways to supplement training with reliable signals from unsupervised objectives. \citet{madry} explore adversarial robustness and propose PGD adversarial training, where models are trained with a minimax robust optimization objective. \citet{Zhang2019theoretically} improve upon this work with a modified loss function and develop a better understanding of the trade-off between adversarial accuracy and natural accuracy.
\textbf{Out-of-distribution detection.}\quad
Out-of-distribution detection has a long history. Traditional methods such as one-class SVMs \citep{OC-SVM} have been revisited with deep representations \citep{DeepSVDD}, yielding improvements on complex data. A central line of recent exploration has been with out-of-distribution detectors using supervised representations. \citet{hendrycks17baseline} propose using the maximum softmax probability of a classifier for out-of-distribution detection. \citet{kimin} expand on this by generating synthetic outliers and training the representations to flag these examples as outliers. However, \citet{outlier_exposure} find that training against a large and diverse dataset of outliers enables far better out-of-distribution detection on unseen distributions. In these works, detection is most difficult for near-distribution outliers, which suggests a need for new methods that force the model to learn more about the structure of in-distribution examples.
\section{Robustness}
\subsection{Robustness to Adversarial Perturbations}\label{section:adv}
%As neural networks are increasingly deployed in safety-critical real-world systems, it becomes imperative to ensure they are not just effective against natural inputs, but can also handle intentionally malicious data.
Improving robustness to adversarial inputs has proven difficult, with adversarial training providing the only longstanding gains \citep{bypass, obfuscated_gradients}. In this section, we demonstrate that auxiliary self-supervision in the form of predicting rotations \citep{rotnet} can improve upon standard Projected Gradient Descent (PGD) adversarial training \citep{madry}. We also observe that auxiliary self-supervision can provide gains when complemented with stronger defenses like TRADES \citep{Zhang2019theoretically} and is not broken by gradient-free attacks such as SPSA \citep{uesato2018adversarial}.
\begin{table*}[t]
\begin{center}
\begin{tabular}{lccc}
\toprule
& Clean & 20-step PGD & 100-step PGD \\ \midrule
Normal Training & 94.8 & 0.0 & 0.0 \\
% Adversarial Training & 87.3 & 45.8 \\
Adversarial Training & 84.2 & 44.8 & 44.8 \\
+ Auxiliary Rotations (Ours) & 83.5 & 50.4 & 50.4 \\
\bottomrule
\end{tabular}
\end{center}
\caption{Results for our defense. All results use $\varepsilon=8.0/255$. For 20-step adversaries $\alpha=2.0/255$, and for 100-step adversaries $\alpha=0.3/255$. More steps do not change results, so the attacks converge. Self-supervision through rotations provides large gains over standard adversarial training.\looseness=-1}
% \vspace{-10pt}
\label{tab:advresults}
\end{table*}
\textbf{Setup.}\quad
The problem of defending against bounded adversarial perturbations can be formally expressed as finding model parameters $\theta$ for the classifier $p$ that minimize the objective
\begin{equation}
\begin{matrix}
\min_{\theta} \mathbb{E}_{(x, y) \sim \mathcal{D}} \left[ \max_{x' \in S} \mathcal{L}_\text{CE}(y, p(y\mid x'); \theta) \right ]
&
\textup{where}
&
S = \{x':\left \| x - x' \right \| < \varepsilon\}
\end{matrix}
\end{equation}
In this paper, we focus on $\ell_\infty$ norm bounded adversaries. \citet{madry} propose that PGD is ``a universal first-order adversary.'' Hence, we first focus on defending against PGD. Let $\textup{PGD}_K(x) = x^K$ be the $K^{\text{th}}$ step of PGD, where
\begin{equation}
\begin{matrix}
x^{k+1} = \Pi_{S} \left( x^k + \alpha \textup{ sign}(\nabla_{x} \mathcal{L}_\text{CE}(y, p(y\mid x^k); \theta)) \right)
&
\textup{and}
&
x^0 = x + U(-\delta, \delta).
\end{matrix}
\end{equation}
$K$ is a preset parameter specifying the number of steps taken, $\Pi_S$ is the projection operator for the $\ell_\infty$ ball $S$, and $\mathcal{L}_\text{CE}(y, p(y\mid x^k); \theta)$ is the loss we want the PGD attacker to maximize. Normally, this loss is the cross-entropy between the model's softmax classification output for $x^k$ and the ground truth label $y$. For evaluating robust accuracy, we use 20-step and 100-step adversaries. For the 20-step adversary, we set the step size $\alpha=2/255$. For the 100-step adversary, we set $\alpha=0.3/255$ as in \citet{madry}. During training, we use 10-step adversaries with $\alpha=2/255$.
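% For reference, a minimal PyTorch-style sketch of the $K$-step PGD adversary in Equation (2),
% assuming a model that maps images in [0, 1] to logits; names below are illustrative rather
% than taken from our released code.
% import torch
% import torch.nn.functional as F
%
% def pgd_attack(model, x, y, eps=8/255, alpha=2/255, steps=20):
%     """Random start, signed-gradient ascent steps, and projection onto the l_inf eps-ball."""
%     x_adv = (x + torch.empty_like(x).uniform_(-eps, eps)).clamp(0, 1).detach()  # x^0
%     for _ in range(steps):
%         x_adv.requires_grad_(True)
%         loss = F.cross_entropy(model(x_adv), y)                    # loss the adversary maximizes
%         grad = torch.autograd.grad(loss, x_adv)[0]
%         with torch.no_grad():
%             x_adv = x_adv + alpha * grad.sign()                    # ascent step of size alpha
%             x_adv = torch.min(torch.max(x_adv, x - eps), x + eps)  # project onto the eps-ball
%             x_adv = x_adv.clamp(0, 1)
%     return x_adv.detach()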
\begin{wrapfigure}{r}{0.5\textwidth}
\vspace{-5pt}
\includegraphics[width=0.95\linewidth]{figures/eps.pdf}
\caption{The effect of attack strength on an $\varepsilon=8/255$ adversarially trained model. The attack strengths are $\varepsilon \in \{ 4/255, 5/255, \ldots, 10/255 \}$. Clean accuracy does not change when applying self-supervision, and hence self-supervision's benefits are masked when observing the clean accuracy alone.}
\label{fig:advresultsfig}
\vspace{-5pt}
\end{wrapfigure}
In all experiments, we use 40-2 Wide Residual Networks \cite{wideresnet}. For training, we use SGD with Nesterov momentum of 0.9 and a batch size of 128. We use an initial learning rate of 0.1 and a cosine learning rate schedule \cite{sgdr} and weight decay of $5\times 10^{-4}$. For data augmentation, we use random cropping and mirroring. Hyperparameters were chosen as standard values and are used in subsequent sections unless otherwise specified.
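% One way the stated optimization setup could be written in PyTorch; the ResNet-18 and epoch
% count below are placeholders for the 40-2 Wide ResNet and training length used here, and the
% cosine schedule shown is the single-cycle variant rather than warm restarts.
% import torch
% from torchvision.models import resnet18  # stand-in backbone, for illustration only
%
% model = resnet18(num_classes=10)
% epochs = 100  # illustrative
% optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9,
%                             weight_decay=5e-4, nesterov=True)
% scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
% # call scheduler.step() once per epoch after the optimizer updates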
% By setting PGD as the adversary in adversarial training, we get PGD training.
% Concretely, the SGD update equation for one batch $\{ x_1, x_2, \ldots, x_b \}$ during PGD training is:
% \begin{equation}
% \begin{matrix}
% \theta = \theta - \eta \left( \frac{1}{b} \sum_{i=1}^b \nabla_\theta L(\theta, \textup{PGD}(x_i), y) \right )
% \end{matrix}
% \end{equation}
% Explain adversarial examples problem. Most proposed defenses fail. Adversarial training is an exception. Describe adversarial training with words and some math. Prior work: larger models and more data improve robust accuracy.
\textbf{Method.}\quad
We explore improving representation robustness beyond standard PGD training with auxiliary rotation-based self-supervision in the style of \cite{rotnet}. In our approach, we train a classification network along with a separate auxiliary head, which takes the penultimate vector from the network as input and outputs a 4-way softmax distribution. This head is trained along with the rest of the network to predict the amount of rotation applied to a given input image (from 0\degree, 90\degree, 180\degree, and 270\degree). Our overall loss during training can be broken down into a supervised loss and a self-supervised loss
\begin{equation}
\mathcal{L} (x, y; \theta) = \mathcal{L}_{\textup{CE}} (y, p(y \mid \textup{PGD}(x)); \theta) + \lambda \mathcal{L}_{\textup{SS}} (\textup{PGD}(x); \theta).
\end{equation}
Note that the self-supervised component of the loss does not require the ground truth training label $y$ as input. The supervised loss does not make use of our auxiliary head, while the self-supervised loss only makes use of this head. However, both losses use exactly the same network up until the penultimate vector. When $\lambda = 0$, our total loss falls back to the loss used in standard PGD training. For our experiments, we use $\lambda = 0.5$ and the following rotation-based self-supervised loss
\begin{equation}
% \mathcal{L}_{\textup{SS}} = \frac{1}{4} \bigg( \mathcal{L}_{ROT}(\theta, R_{0}(x), 0^{\circ}) + \mathcal{L}_{ROT}(\theta, R_{90}(x), 90^{\circ}) + \mathcal{L}_{ROT}(\theta, R_{180}(x),180^{\circ}) + \mathcal{L}_{ROT}(\theta, R_{270}(x), 270^{\circ}) \bigg) \\
\mathcal{L}_{\textup{SS}}(x; \theta) = \frac{1}{4} \left[ \sum_{ r \in \{ 0^{\circ}, 90^{\circ}, 180^{\circ}, 270^{\circ} \} } \mathcal{L}_{\textup{CE}}(\texttt{one\_hot}(r), p_{\texttt{rot\_head}}(r \mid R_{r}(x)); \theta)\right],
\end{equation}
where $R_{r}(x)$ rotates $x$ by $r$ degrees and the cross-entropy is taken between the auxiliary head's softmax output and the ground-truth rotation label $r \in \{ 0^{\circ}, 90^{\circ}, 180^{\circ}, 270^{\circ} \}$. In order to adapt the PGD adversary to the new training setup, we modify the loss used in the PGD update equation (2) to maximize both the rotation loss and the classification loss. In Appendix \ref{app:not_attacking_rotation}, we find that this modification is optional and that the main source of improvement comes from the rotation loss itself. We report results with the modification here for completeness. The overall loss that PGD tries to maximize for each training image is $\mathcal{L}_{\textup{CE}}(y, p(y\mid x); \theta) + \mathcal{L}_{\textup{SS}}(x; \theta)$. At test time, the PGD loss does not include the $\mathcal{L}_{\textup{SS}}$ term, since we want to specifically test and attack classification robustness.
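% A short sketch of how the combined loss in Equations (3) and (4) could be computed, assuming
% `backbone` returns the penultimate vector and `cls_head` / `rot_head` are the classification
% and 4-way rotation heads; `x_adv` is the adversarial batch produced by the PGD sketch above.
% import torch
% import torch.nn.functional as F
%
% def rotation_ss_loss(backbone, rot_head, x):
%     """Average cross-entropy over the four rotated copies of the batch (Equation 4)."""
%     losses = []
%     for k in range(4):                                   # k quarter turns: 0, 90, 180, 270 degrees
%         x_rot = torch.rot90(x, k, dims=(2, 3))           # rotate NCHW image batches
%         target = torch.full((x.size(0),), k, dtype=torch.long, device=x.device)
%         losses.append(F.cross_entropy(rot_head(backbone(x_rot)), target))
%     return sum(losses) / 4
%
% def total_loss(backbone, cls_head, rot_head, x_adv, y, lam=0.5):
%     """Equation (3): supervised loss on adversarial images plus lambda times the rotation loss."""
%     sup = F.cross_entropy(cls_head(backbone(x_adv)), y)
%     return sup + lam * rotation_ss_loss(backbone, rot_head, x_adv)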
% rot_five_by_8 refers to coefficient of 0.5 on self-sup loss during training and dividing self-sup loss by 8 within adversarial search
\textbf{Results and analysis.}\quad We are able to attain large improvements over standard PGD training by adding self-supervised rotation prediction. Table \ref{tab:advresults} contains results of our model against PGD adversaries with $K=20$ and $K=100$. In both cases, we are able to achieve a 5.6\% absolute improvement over classical PGD training. In Figure \ref{fig:advresultsfig}, we observe that our method of adding auxiliary rotations actually provides larger gains over standard PGD training as the maximum perturbation distance $\varepsilon$ increases. The figure also shows that our method can withstand up to 11\% larger perturbations than PGD training without any drop in performance. A possible reason for these robustness gains is that predicting rotations can require modeling shape more so than classification alone, as demonstrated in Figure \ref{fig:zebrafig}.
In order to demonstrate that our method does not rely on gradient obfuscation, we attacked our models with SPSA \citep{uesato2018adversarial}, a gradient-free attack, and did not observe any performance degradation compared to standard PGD training. In addition, since our self-supervised method is easily adaptable to supplement other supervised defenses, we also studied the effect of adding self-supervised rotations to stronger defenses such as TRADES \citep{Zhang2019theoretically}. We found that self-supervision helps in this setting as well. Our best-performing TRADES + rotations model gives a 1.22\% boost in robust accuracy over standard TRADES and a 7.79\% boost over standard PGD training. For implementation details, see our code.
% Overall, our results demonstrate that self-supervision can improve robustness of representations \textit{without requiring any additional data or larger models.}
% large improvements
% can withstand x\% larger perturbations for same performance
% complementary with stronger defenses like TRADES.
% Overall, results demonstrate that self-supervision can improve robustness of representations \textit{without requiring additional data or larger models.}
% Also, ``we try to break model with SPSA, and it didn't work'', or something like that. We can treat SPSA as a method that gauges whether a defense relies on gradient obfuscation. No need to mention numbers.
\subsection{Robustness to Common Corruptions}\label{section:common_corruptions}
\textbf{Setup.}\quad
In real-world applications of computer vision systems, inputs can be corrupted in various ways that may not have been encountered during training. Improving robustness to these common corruptions is especially important in safety-critical applications. \citet{hendrycks2019robustness} create a set of fifteen test corruptions and four validation corruptions to measure input corruption robustness. These corruptions fall into noise, blur, weather, and digital categories. Examples include shot noise, zoom blur, snow, and JPEG compression.
We use the CIFAR-10-C dataset from \citet{hendrycks2019robustness} and compare the robustness of normally trained classifiers to classifiers trained with an auxiliary rotation prediction loss. As in \Cref{section:adv}, we predict all four rotations in parallel in each batch. We use 40-2 Wide Residual Networks and the same optimization hyperparameters as before. We do not tune on the validation corruptions, so we report average performance over all corruptions. Results are in Figure \ref{fig:common_corruptions}.
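% A sketch of how average corruption accuracy could be computed from the released CIFAR-10-C
% .npy files; data loading and input normalization are simplified, and `model` is assumed to be
% a trained CIFAR-10 classifier mapping NCHW tensors in [0, 1] to logits.
% import numpy as np
% import torch
%
% def corruption_accuracy(model, images, labels, batch_size=256):
%     """Accuracy on one CIFAR-10-C corruption file (10,000 images at each of 5 severities)."""
%     model.eval()
%     correct = 0
%     with torch.no_grad():
%         for i in range(0, len(images), batch_size):
%             x = torch.from_numpy(images[i:i + batch_size]).permute(0, 3, 1, 2).float() / 255
%             pred = model(x).argmax(dim=1).numpy()
%             correct += (pred == labels[i:i + batch_size]).sum()
%     return correct / len(images)
%
% # e.g., average over corruption files:
% # accs = [corruption_accuracy(net, np.load(f"CIFAR-10-C/{c}.npy"), np.load("CIFAR-10-C/labels.npy"))
% #         for c in corruption_names]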
\begin{figure*}[t]
\centering
\centerline{\includegraphics[width=1\linewidth]{figures/reordered_coff_def_taller_2.pdf}}
\vspace{-0.1in}
\caption{A comparison of the accuracy of usual training compared to training with auxiliary rotation self-supervision on the nineteen CIFAR-10-C corruptions. Each bar represents an average over all five corruption strengths for a given corruption type.}\label{fig:common_corruptions}
\vspace{-5pt}
\end{figure*}
% \begin{wraptable}{r}{7cm}
% \begin{tabular}{lcc}
% \toprule
% & Normal Training & Rotations \\ \midrule
% CIFAR-10 & 72.3 & 76.9 \\
% CIFAR-100 & 47.2 & 47.5 \\
% \bottomrule
% \end{tabular}
% \label{tab:common_corruptions}
% \caption{The subtlety of nature is many times greater than the subtlety of argument.}
% \end{wraptable}
\textbf{Results and analysis.}\quad
The baseline of normal training achieves a clean accuracy of 94.7\% and an average accuracy over all corruptions of 72.3\%. Training with auxiliary rotations maintains clean accuracy at 95.5\% but increases the average accuracy on corrupted images by 4.6\% to 76.9\%. Thus, the benefits of self-supervision to robustness are masked by similar accuracy on clean images. Performance gains are spread across corruptions, with a small loss of performance in only one corruption type, JPEG compression. For glass blur, accuracy improves by 11.4\%, and for Gaussian noise it improves by 11.6\%. Performance is also improved by 8.9\% on contrast and shot noise and 4.2\% on frost, indicating substantial gains in robustness on a wide variety of corruptions. These results demonstrate that self-supervision can regularize networks to be more robust even if clean accuracy is not affected.
\subsection{Robustness to Label Corruptions}
% Introduce label noise problem. Following a certain line of work. We corrupt labels according to uniform distribution (math). Talk about how you measure performance.
% In the task of classification under label corruption, the goal is to learn as good a classifier as possible on a dataset with corrupted labels. In accordance with prior work \cite{Sukhbaatar} we focus on multi-class classification. Let $x$, $y$, and $\widetilde{y}$ be an input, clean label, and potentially corrupted label respectively. The labels take values from $1$ to $K$. Given a dataset $\mathcal{D}$ of $(x,\widetilde{y})$ pairs with $x$ drawn from $p(x)$ and $\widetilde{y}$ drawn from $p(\widetilde{y} \mid y, x)$, the task is to predict $\arg\max_y p(y \mid x)$.
% To experiment with a variety of corruption severities, we corrupt the true label with a given probability to a randomly chosen incorrect class. Formally, we generate corrupted labels with a ground truth matrix of corruption probabilities $C$, where $C_{ij} = p(\widetilde{y}=j \mid y=i)$ is the probability of corrupting an example with label $i$ to label $j$. Given a corruption strength $s$, we construct $C$ with $(1-s)I + s\mathsf{1}{\mathsf{1}}^\mathsf{T}/K$, $I$ the $K\times K$ identity matrix. To measure performance, we use the area under the curve plotting test error against corruption strength. This is generated via linear interpolation between test errors at corruption strengths from $0$ to $1$ in increments of $0.1$, summarizing a total of 11 experiments.
\textbf{Setup.}\quad
Training classifiers on corrupted labels can severely degrade performance. Thus, several prior works have explored training deep neural networks to be robust to label noise in the multi-class classification setting \cite{Sukhbaatar, Patrini, hendrycks2018glc}. We use the problem setting from these works. Let $x$, $y$, and $\widetilde{y}$ be an input, clean label, and potentially corrupted label respectively. Given a dataset $\widetilde{\mathcal{D}}$ of $(x,\widetilde{y})$ pairs for training, the task is to obtain high classification accuracy on a test dataset $\mathcal{D}_\text{test}$ of cleanly-labeled $(x,y)$ pairs.
Given a cleanly-labeled training dataset $\mathcal{D}$, we generate $\widetilde{\mathcal{D}}$ with a corruption matrix $C$, where $C_{ij} = p(\widetilde{y}=j \mid y=i)$ is the probability of a ground truth label $i$ being corrupted to $j$. With $K$ the number of classes, we construct $C$ according to $C = (1-s)I_K + s\mathsf{1}{\mathsf{1}}^\mathsf{T}/K$. In this equation, $s$ is the corruption strength, which lies in $[0,1]$. At a corruption strength of $0$, the labels are unchanged, while at a corruption strength of $1$ the labels have an equal chance of being corrupted to any class. To measure performance, we average classification error on $\mathcal{D}_\text{test}$ over corruption strengths from $0$ to $1$ in increments of $0.1$, for a total of 11 experiments.
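% A minimal sketch of the corruption process described above: each corrupted label is sampled
% from the row of C indexed by the clean label. Names and the seed are illustrative.
% import numpy as np
%
% def corrupt_labels(y, num_classes, s, seed=0):
%     """Sample labels from C = (1 - s) * I + s * 11^T / K at corruption strength s in [0, 1]."""
%     rng = np.random.default_rng(seed)
%     C = (1 - s) * np.eye(num_classes) + s * np.ones((num_classes, num_classes)) / num_classes
%     return np.array([rng.choice(num_classes, p=C[label]) for label in y])
%
% # e.g., corrupted = corrupt_labels(train_labels, num_classes=10, s=0.5)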
\begin{figure*}[t]
\centering
\begin{subfigure}{.32\textwidth}
\centering
\includegraphics[width=1.0\linewidth]{figures/label_noise/cifar10_no_corr.pdf}
\label{fig:plot11}
\end{subfigure}
\begin{subfigure}{.32\textwidth}
\centering
\includegraphics[width=1.0\linewidth]{figures/label_noise/cifar10_glc_at_5.pdf}
\label{fig:plot14}
\end{subfigure}
\begin{subfigure}{.32\textwidth}
\centering
\includegraphics[width=1.0\linewidth]{figures/label_noise/cifar10_glc_at_10.pdf}
\label{fig:plot13}
\end{subfigure}
\centering
\begin{subfigure}{.32\textwidth}
\centering
\includegraphics[width=1.0\linewidth]{figures/label_noise/cifar100_no_corr.pdf}
\label{fig:plot21}
\end{subfigure}
\begin{subfigure}{.32\textwidth}
\centering
\includegraphics[width=1.0\linewidth]{figures/label_noise/cifar100_glc_at_5.pdf}
\label{fig:plot24}
\end{subfigure}
\begin{subfigure}{.32\textwidth}
\centering
\includegraphics[width=1.0\linewidth]{figures/label_noise/cifar100_glc_at_10.pdf}
\label{fig:plot23}
\end{subfigure}
\vspace{-0.1in}
\caption{Error curves for label corruption comparing normal training to training with auxiliary rotation self-supervision. Auxiliary rotations improve performance when training without loss corrections and are complementary with the GLC loss correction method.}\label{fig:labelnoise}
\vspace{-10pt}
\end{figure*}
\textbf{Methods.}\quad
Training without loss correction methods or self-supervision serves as our first baseline, which we call \textit{No Correction} in Table \ref{tab:labelnoise}. Next, we compare to the state-of-the-art \textit{Gold Loss Correction (GLC)} \cite{hendrycks2018glc}. This is a two-stage loss correction method based on \cite{Sukhbaatar} and \cite{Patrini}. The first stage of training estimates the matrix $C$ of conditional corruption probabilities, which partially describes the corruption process. The second stage uses the estimate of $C$ to train a corrected classifier that performs well on the clean label distribution. The \textit{GLC} assumes access to a small dataset of trusted data with cleanly-labeled examples. Thus, we specify the amount of trusted data available in each experiment as a fraction of the training set. This setup is also known as a semi-verified setting \cite{semiverified}.
To investigate the effect of self-supervision, we use the combined loss $\mathcal{L}_{\textup{CE}}(y, p(y\mid x); \theta) + \lambda\mathcal{L}_\text{SS}(x; \theta)$, where the first term is standard cross-entropy loss and the second term is the auxiliary rotation loss defined in \Cref{section:adv}. We call this \textit{Rotations} in Table \ref{tab:labelnoise}. In all experiments, we set $\lambda = 0.5$. \citet{rotnet} demonstrate that predicting rotations can yield effective representations for subsequent fine-tuning on target classification tasks. We build on this approach and pre-train with the auxiliary rotation loss alone for 100 epochs, after which we fine-tune for 40 epochs with the combined loss.
We use 40-2 Wide Residual Networks \citep{wideresnet}. Hyperparameters remain unchanged from \Cref{section:adv}. To select the number of fine-tuning epochs, we use a validation split of the CIFAR-10 training dataset with clean labels and select a value to bring accuracy close to that of \textit{Normal Training}. Results are in \Cref{tab:labelnoise} and performance curves are in \Cref{fig:labelnoise}.
% Fine-tuning with cross-entropy alone yields significantly worse results. Results are in Table \#, and Figure \# shows loss curves for the methods we evaluate. (TODO: rerun these experiments and report numbers)
\begin{table*}[ht]
% \vspace{-5pt}
\begin{center}
\begin{tabular}{lcccc}
\toprule
& \multicolumn{2}{c}{CIFAR-10} & \multicolumn{2}{c}{CIFAR-100} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}
& Normal Training & Rotations & Normal Training & Rotations \\ \midrule
No Correction & 27.4 & 21.8 & 52.6 & 47.4 \\
% Forward Correction & 29.9 & 25.8 & 53.9 & 49.9 \\
GLC (5\% Trusted) & 14.6 & 10.5 & 48.3 & 43.2 \\
GLC (10\% Trusted) & 11.6 & 9.6 & 39.1 & 36.8 \\ \bottomrule
\end{tabular}
\end{center}
% \vspace{0pt}
\caption{Label corruption results comparing normal training to training with auxiliary rotation self-supervision. Each value is the average error over 11 corruption strengths. All values are percentages. The reliable training signal from self-supervision improves resistance to label noise.}
\label{tab:labelnoise}
\vspace{-15pt}
\end{table*}
\textbf{Analysis.}\quad
We observe large gains in robustness from auxiliary rotation prediction. Without loss corrections, we reduce the average error by 5.6\% on CIFAR-10 and 5.2\% on CIFAR-100. Put differently, relative to training with rotations, the error of normal training is 26\% higher on CIFAR-10 and 11\% higher on CIFAR-100. In fact, auxiliary rotation prediction with no loss correction outperforms the GLC with 5\% trusted data on CIFAR-100. This is surprising given that the GLC was developed specifically to combat label noise.
We also observe additive effects with the GLC. On CIFAR-10, the GLC with 5\% trusted data obtains 14.6\% average error, which is reduced to 10.5\% with the addition of auxiliary rotation prediction. Note that doubling the amount of trusted data to 10\% yields 11.6\% average error. Thus, using self-supervision can enable obtaining better performance than doubling the amount of trusted data in a semi-supervised setting. On CIFAR-100, we observe similar complementary gains from auxiliary rotation prediction. Qualitatively, we can see in \Cref{fig:labelnoise} that performance degradation as the corruption strength increases is softer with auxiliary rotation prediction.
On CIFAR-100, error at 0\% corruption strength is 2.3\% higher with auxiliary rotation prediction. This is because we selected the number of fine-tuning epochs using CIFAR-10 at 0\% corruption strength, where the degradation is only 1.3\%. Fine-tuning for longer can eliminate this gap, but it also leads to overfitting the label noise \citep{noise_label_overfitting}. Controlling this trade-off between robustness and performance on clean data is application-specific. However, past a corruption strength of 20\%, auxiliary rotation prediction improves performance for all tested corruption strengths and methods.
% \textbf{Ablations.}\quad
% Fine-tuning with cross-entropy alone yields significantly worse results, so combined training is important. Removing the pre-training step also harms performance, because the model overfits to the label noise as has been observed in prior work [CITE]. Simply training for 40 epochs without self-supervised pre-training also doesn't work, so our method does not simply underfit the data.
\section{Out-of-Distribution Detection}
We now show that self-supervised learning can also aid out-of-distribution detection; in particular, rotation prediction enables the detection of harder out-of-distribution examples. In the following two sections, we show that self-supervised learning improves out-of-distribution detection when the in-distribution consists of multiple classes or of a single class.
\subsection{Multi-Class Out-of-Distribution Detection}
\noindent\textbf{Setup.}\quad In the following experiment, we train a CIFAR-10 classifier and use it as an out-of-distribution detector. Given an example $x$, we write the classifier's posterior distribution over the ten classes as $p(y\mid x)$. \citet{hendrycks17baseline} show that $p(y\mid x)$ can enable the detection of out-of-distribution examples: the maximum softmax probability $\max_c p(y=c \mid x)$ tends to be higher for in-distribution examples than for out-of-distribution examples across a range of tasks, and this confidence discrepancy allows out-of-distribution examples to be separated and detected. \citet{outlier_exposure} show that the KL divergence between the uniform distribution $U$ and the softmax prediction performs similarly to $\max_c p(y=c \mid x)$.
We evaluate each OOD detector using the area under the receiver operating characteristic curve (AUROC) \citep{auroc}. Given an input image, each OOD detector produces an anomaly score in the form of a single real number. The AUROC is equal to the probability that an out-of-distribution example receives a higher anomaly score than an in-distribution example. Thus, an OOD detector with a 50\% AUROC is at random-chance levels, while one with a 100\% AUROC is flawless.
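% The AUROC can be computed directly from the anomaly scores, e.g., with scikit-learn; the
% arrays below are illustrative, with out-of-distribution examples as the positive class.
% import numpy as np
% from sklearn.metrics import roc_auc_score
%
% in_scores = np.array([0.1, 0.3, 0.2])    # anomaly scores for in-distribution test examples
% out_scores = np.array([0.7, 0.4, 0.9])   # anomaly scores for out-of-distribution examples
% labels = np.concatenate([np.zeros_like(in_scores), np.ones_like(out_scores)])
% auroc = roc_auc_score(labels, np.concatenate([in_scores, out_scores]))
% print(f"AUROC: {100 * auroc:.1f}%")      # 100% means every OOD score exceeds every in-distribution score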
\noindent\textbf{Method.}\quad We train a classifier with an auxiliary self-supervised rotation loss. The loss during training is $\mathcal{L}_{\text{CE}}(y, p(y\mid x)) + \sum_{ r \in \{ 0^{\circ}, 90^{\circ}, 180^{\circ}, 270^{\circ} \} } \mathcal{L}_{\textup{CE}}(\texttt{one\_hot}(r), p_{\texttt{rot\_head}}(r \mid R_{r}(x)))$, and we only train on in-distribution CIFAR-10 training examples. After training is complete, we score in-distribution CIFAR-10 test set examples and out-of-distribution examples with the formula
$
\text{KL}[U\|p(y\mid x)] + \frac{1}{4} \sum_{ r \in \{ 0^{\circ}, 90^{\circ}, 180^{\circ}, 270^{\circ} \} } \mathcal{L}_{\text{CE}}(\texttt{one\_hot}(r), p_{\texttt{rot\_head}}(r \mid R_{r}(x))).
$
\begin{wrapfigure}{r}{0.35\textwidth}
\vspace{-25pt}
\begin{center}
\begin{tabular}{l|c}
\toprule
Method & \qquad AUROC \\ \midrule
Baseline & \qquad 91.4\%\\ % strengthen this baseline so it's more of a victory
% \cdashline{1-2}
Rotations (Ours) & \qquad 96.2\%\\
\bottomrule
\end{tabular}
\end{center}
\caption{OOD detection performance of the maximum softmax probability baseline and our method using self-supervision. Full results are in \Cref{app:multiclass}.}
\label{tab:multiclassood}
\vspace{-10pt}
\end{wrapfigure}
The training loss is the standard cross-entropy loss with auxiliary rotation prediction. The detection score is the KL divergence detector from prior work with a rotation score added to it. The rotation score is the cross-entropy between the rotation softmax distribution and the one-hot distribution that places probability $1$ on the applied rotation, which is equivalent to the negative log probability assigned to the true rotation. Averaging these cross-entropies over the four rotations gives the total rotation score.
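% A sketch of this detection score, assuming the same `backbone`, `cls_head`, and `rot_head` as
% in the adversarial-training sketch; it combines KL[U || p(y|x)] with the average rotation
% cross-entropy over the four rotations.
% import math
% import torch
% import torch.nn.functional as F
%
% def anomaly_score(backbone, cls_head, rot_head, x, num_classes=10):
%     """Higher scores indicate more anomalous inputs."""
%     with torch.no_grad():
%         log_p = F.log_softmax(cls_head(backbone(x)), dim=1)
%         kl_uniform = -log_p.mean(dim=1) - math.log(num_classes)   # KL(U || p(y|x)) per example
%         rot_term = 0.0
%         for k in range(4):
%             x_rot = torch.rot90(x, k, dims=(2, 3))
%             target = torch.full((x.size(0),), k, dtype=torch.long, device=x.device)
%             rot_term = rot_term + F.cross_entropy(rot_head(backbone(x_rot)), target,
%                                                   reduction="none")
%         return kl_uniform + rot_term / 4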
\noindent\textbf{Results and Analysis.}\quad
We evaluate this proposed method against the maximum softmax probability baseline \citep{hendrycks17baseline} on a wide variety of anomalies with CIFAR-10 as the in-distribution data. For the anomalies, we select Gaussian, Rademacher, Blobs, Textures, SVHN, Places365, LSUN, and CIFAR-100 images. We observe performance gains across the board and report average AUROC values in \Cref{tab:multiclassood}. On average, the rotation method increases the AUROC by 4.8\%.\looseness=-1
This method does not require additional data as in Outlier Exposure \citep{outlier_exposure}, although combining the two could yield further benefits. As is, the performance gains are of comparable magnitude to more complex methods proposed in the literature \citep{adverkaiming}. This demonstrates that self-supervised auxiliary rotation prediction can augment OOD detectors based on fully supervised multi-class representations. More detailed descriptions of the OOD datasets and the full results on each anomaly type with additional metrics are in \Cref{app:multiclass}.
\subsection{One-Class Learning}
\textbf{Setup.}\quad
In the following experiments, we take a dataset consisting of $k$ classes and train a model on one class. This model is used as an out-of-distribution detector. For the source of OOD examples, we use the examples from the remaining unseen $k-1$ classes. Consequently, for the datasets we consider, the OOD examples are near the in-distribution and make for a difficult out-of-distribution detection challenge.
\subsubsection{CIFAR-10}
\begin{table*}[ht]
\small
\setlength\tabcolsep{3pt}
\begin{center}
\begin{tabular}{lcccccc|c|cc}
\toprule
& OC-SVM & DeepSVDD & Geometric & RotNet & DIM & IIC & Supervised (OE) & Ours & Ours + OE
\\ \midrule
Airplane & 65.6 & 61.7 & 76.2 & 71.9 & 72.6 & 68.4 & 87.6 & 77.5 & 90.4 \\
Automobile & 40.9 & 65.9 & 84.8 & 94.5 & 52.3 & 89.4 & 93.9 & 96.9 & 99.3 \\
Bird & 65.3 & 50.8 & 77.1 & 78.4 & 60.5 & 49.8 & 78.6 & 87.3 & 93.7 \\
Cat & 50.1 & 59.1 & 73.2 & 70.0 & 53.9 & 65.3 & 79.9 & 80.9 & 88.1 \\
Deer & 75.2 & 60.9 & 82.8 & 77.2 & 66.7 & 60.5 & 81.7 & 92.7 & 97.4 \\
Dog & 51.2 & 65.7 & 84.8 & 86.6 & 51.0 & 59.1 & 85.6 & 90.2 & 94.3 \\
Frog & 71.8 & 67.7 & 82.0 & 81.6 & 62.7 & 49.3 & 93.3 & 90.9 & 97.1 \\
Horse & 51.2 & 67.3 & 88.7 & 93.7 & 59.2 & 74.8 & 87.9 & 96.5 & 98.8 \\
Ship & 67.9 & 75.9 & 89.5 & 90.7 & 52.8 & 81.8 & 92.6 & 95.2 & 98.7 \\
Truck & 48.5 & 73.1 & 83.4 & 88.8 & 47.6 & 75.7 & 92.1 & 93.3 & 98.5 \\ \hline
Mean & 58.8 & 64.8 & 82.3 & 83.3 & 57.9 & 67.4 & 87.3 & 90.1 & 95.6 \\
\bottomrule
\end{tabular}
\end{center}
\caption{AUROC values of different OOD detectors trained on one of ten CIFAR-10 classes. Test time out-of-distribution examples are from the remaining nine CIFAR-10 classes. In-distribution examples are examples belonging to the row's class. Our self-supervised technique surpasses a fully supervised model. All values are percentages.}
\label{tab:cifar}
\end{table*}
\textbf{Baselines.}\quad
One-class SVMs \citep{OC-SVM} are an unsupervised out-of-distribution detection technique which models the training distribution by finding a small region containing most of the training set examples, and points outside this region are deemed OOD. In our experiment, OC-SVMs operate on the raw CIFAR-10 pixels. Deep SVDD \citep{DeepSVDD} uses convolutional networks to extract features from the raw pixels all while modelling one class, like OC-SVMs.
RotNet \citep{rotnet} is a successful self-supervised technique which learns its representations by predicting whether an input is rotated 0\degree, 90\degree, 180\degree, or 270\degree. After training RotNet, we use the softmax probabilities to determine whether an example is in- or out-of-distribution. To do this, we feed the network the original example (0\degree) and record RotNet's softmax probability assigned to the 0\degree\hspace{1pt} class. We then rotate the example 90\degree\, and record the probability assigned to the 90\degree\, class. We do the same for 180\degree\, and 270\degree, and add up these probabilities. The sum of the probabilities for in-distribution examples will tend to be higher than the sum for OOD examples, so the negative of this sum is the anomaly score. Next, \citet{golan} (Geometric) predict transformations such as rotations and whether an input is horizontally flipped; we are the first to connect this method to self-supervised learning, and we improve their method. Deep InfoMax \citep{deepinfomax} networks learn representations which have high mutual information with the input; for detection we use the scores of the discriminator network. A recent self-supervised technique is Invariant Information Clustering (IIC) \citep{IID}, which clusters images without labels by learning representations that are invariant to geometric perturbations such as rotations, scaling, and skewing. For our supervised baseline, we use a deep network which performs logistic regression, and for the negative class we use Outlier Exposure. In Outlier Exposure, the network is exposed to examples from a real, diverse dataset of out-of-distribution examples. Done correctly, this process teaches the network to generalize to unseen anomalies. For the outlier dataset, we use 80 Million Tiny Images \citep{80mil_tiny_images} with CIFAR-10 and CIFAR-100 examples removed. Crucial to the success of the supervised baseline is our choice of loss function. To ensure the supervised baseline learns from hard examples, we use the Focal Loss \citep{focal_loss}.
\textbf{Method.}\quad
For our self-supervised one-class OOD detector, we use a deep network to predict geometric transformations and thereby surpass previous work and the fully supervised network. Examples are rotated 0\degree, 90\degree, 180\degree, or 270\degree\, then translated 0 or $\pm 8$ pixels vertically and horizontally. These transformations are composed together, and the network has three softmax heads: one for predicting rotation, one for predicting vertical translations, and one for predicting horizontal translations. Concretely, the anomaly score for an example $x$ is
\begin{align*}
\sum_{ r \in \mathcal{R} }
\sum_{ p \in \mathcal{T} }
\sum_{ q \in \mathcal{T} }
f_R(G(x))_r + f_P(G(x))_p + f_Q(G(x))_q,
\end{align*}
where $G$ is the composition of rotations and translations specified by $r$, $p$, and $q$ respectively. $\mathcal{R}$ is the set of rotations, $R$ is the rotation operator, and $f_R$ is the softmax output of the rotation predictor. Likewise with translations for $\mathcal{T}$, $P$, $Q$, $f_P$, and $f_Q$. The backbone architecture is a 16-4 WideResNet \citep{wideresnet} trained with a dropout rate of 0.3 \citep{dropout}. We choose a 16-4 network because there are fewer training samples. Networks are trained with a cosine learning rate schedule \citep{sgdr}, an initial learning rate of 0.1, Nesterov momentum, and a batch size of 128. Data is augmented with standard cropping and mirroring. Our RotNet and supervised baseline use the same backbone architecture and training hyperparameters. When training our method with Outlier Exposure, we encourage the network to have uniform softmax responses on out-of-distribution data. For Outlier Exposure to work successfully, we applied the aforementioned geometric transformations to the outlier images so that the in-distribution data and the outliers are as similar as possible.
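% A rough sketch of this score, assuming separate 4-way rotation and 3-way vertical/horizontal
% translation heads on a shared backbone; the cyclic shift below stands in for the translation
% used in our experiments, and all names are illustrative.
% import itertools
% import torch
% import torch.nn.functional as F
%
% ROTATIONS = [0, 1, 2, 3]    # quarter turns: 0, 90, 180, 270 degrees
% TRANSLATIONS = [-8, 0, 8]   # pixel shifts applied vertically and horizontally
%
% def transform(x, r, p, q):
%     """Compose a rotation of r quarter turns with vertical shift p and horizontal shift q."""
%     return torch.roll(torch.rot90(x, r, dims=(2, 3)), shifts=(p, q), dims=(2, 3))
%
% def one_class_score(backbone, rot_head, vert_head, horiz_head, x):
%     """Sum of softmax probabilities assigned to the applied transformation parameters.
%     Higher values indicate in-distribution inputs; the negative can serve as an anomaly score."""
%     with torch.no_grad():
%         score = 0.0
%         for r, p, q in itertools.product(ROTATIONS, TRANSLATIONS, TRANSLATIONS):
%             feats = backbone(transform(x, r, p, q))
%             score = score + F.softmax(rot_head(feats), dim=1)[:, ROTATIONS.index(r)]
%             score = score + F.softmax(vert_head(feats), dim=1)[:, TRANSLATIONS.index(p)]
%             score = score + F.softmax(horiz_head(feats), dim=1)[:, TRANSLATIONS.index(q)]
%         return score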
Results are in \Cref{tab:cifar}. Notice that many self-supervised techniques perform better than methods specifically designed for one-class learning. Also notice that our self-supervised technique outperforms Outlier Exposure, the state-of-the-art fully supervised method, which additionally requires access to out-of-distribution samples during training. Consequently, a model trained with self-supervision can surpass a fully supervised model. Combining our self-supervised technique with supervision through Outlier Exposure nearly solves this CIFAR-10 task.
% \vspace{-10pt}
\subsubsection{ImageNet}
% Make 30 in final paper, so there are 3k examples.
% \begin{figure*}
% \centering
% \centerline{\includegraphics[width=1\linewidth]{figures/not_hotdog.png}}
% \vspace{-0.1in}
% \caption{abc}\label{fig:hotdog}
% \vspace{-5pt}
% \end{figure*}
\textbf{Dataset.}\quad We consequently turn to a harder dataset to test self-supervised techniques. For this experiment, we select 30 classes from ImageNet \cite{imagenet}. See \Cref{app:imagenetoodclasses} for the classes.
\textbf{Method.}\quad Like before, we demonstrate that a self-supervised model can surpass a model that is fully supervised. The fully supervised model is trained with Outlier Exposure using ImageNet-22K outliers (with ImageNet-1K images removed).
The architectural backbone for these experiments is a ResNet-18. Images are resized such that the smallest side has 256 pixels while the aspect ratio is maintained, then randomly cropped to the size $224\times224\times3$. Since the images are larger than those of CIFAR-10, new additions to the self-supervised method are possible. Consequently, we can teach the network to predict whether an image has been resized. In addition, since we would like the network to more easily learn shape and compare regions across the whole image, we found self-attention \citep{Woo_2018_ECCV} to be useful for this task. Other architectural changes, such as using a Wide \emph{RevNet} \citep{Behrmann2018InvertibleRN} instead of a Wide ResNet, can increase the AUROC from 65.3\% to 77.5\%. AUROCs are shown in \Cref{tab:imagenet}. Self-supervised methods outperform the fully supervised baseline by a large margin, yet there is still wide room for improvement on large-scale OOD detection.
\begin{table}[ht]
\vspace{5pt}
\begin{center}
\begin{tabular}{l|c}
\toprule
Method & AUROC \\ \midrule
Supervised (OE) & 56.1\\ % strengthen this baseline so it's more of a victory
\cdashline{1-2}
RotNet & 65.3\\
RotNet + Translation & 77.9 \\
RotNet + Self-Attention & 81.6\\
RotNet + Translation + Self-Attention & 84.8 \\
RotNet + Translation + Self-Attention + Resize (Ours) & 85.7 \\
\bottomrule
\end{tabular}
\end{center}
\caption{AUROC values of supervised and self-supervised OOD detectors. AUROC values are an average of 30 AUROCs corresponding to the 30 different models trained on exactly one of the 30 classes. Each model's in-distribution examples are from one of 30 classes, and the test out-of-distribution samples are from the remaining 29 classes. The self-supervised methods greatly outperform the supervised method. All values are percentages.}
\vspace{-5pt}
\label{tab:imagenet}
\end{table}
\section{Conclusion}
In this paper, we applied self-supervised learning to improve the robustness and uncertainty of deep learning models beyond what was previously possible with purely supervised approaches. We found large improvements in robustness to adversarial examples, label corruption, and common input corruptions. For all types of robustness that we studied, we observed consistent gains by supplementing current supervised methods with an auxiliary rotation loss. We also found that self-supervised methods can drastically improve out-of-distribution detection on difficult, near-distribution anomalies, and that in CIFAR and ImageNet experiments, self-supervised methods outperform fully supervised methods. Self-supervision had the largest improvement over supervised techniques in our ImageNet experiments, where the larger input size meant that we were able to apply a more complex self-supervised objective. Our results suggest that future work in building more robust models and better data representations could benefit greatly from self-supervised approaches.
\newpage
\subsection{Acknowledgments}
This material is in part based upon work supported by the National Science Foundation Frontier Grant.
Any opinions, findings, and conclusions or recommendations expressed in this material are those of
the author(s) and do not necessarily reflect the views of the National Science Foundation.
% check if accurate
\bibliography{biblio}
\bibliographystyle{plainnat}
\newpage
% \appendix
\begin{appendices}
\crefalias{section}{appsec}
\section{Self-Supervised Learning for Multi-Class OOD Detection}\label{app:multiclass}
\begin{table}[ht]
% \small
\setlength{\tabcolsep}{8pt}
\centering
\begin{tabularx}{\textwidth}{*{1}{>{\hsize=0.3\hsize}X} *{1}{>{\hsize=1.8cm}X }
| *{2}{>{\hsize=0.65\hsize}Y}
|*{2}{>{\hsize=0.65\hsize}Y}
| *{2}{>{\hsize=0.65\hsize}Y} }
% \toprule
\multicolumn{2}{c}{} & \multicolumn{2}{c}{FPR95 $\downarrow$} & \multicolumn{2}{c}{AUROC $\uparrow$} &\multicolumn{2}{c}{AUPR $\uparrow$}\\ \cline{3-8}
$\mathcal{D}_\text{in}$ & \multicolumn{1}{l|}{$\mathcal{D}_\text{out}^\text{test}$} &
{MSP} & {Rotation} & {MSP} & {Rotation} & {MSP} & {Rotation} \\ \hline
\parbox[t]{50mm}{\multirow{8}{*}{\rotatebox{90}{CIFAR-10}}}
& Gaussian & 8.1 & 1.2 & 96.3 & 99.0 & 70.8 & 85.6 \\
& Rademacher& 5.9 & 1.1 & 97.5 & 99.1 & 79.4 & 86.3 \\
& Blobs & 13.3 & 2.3 & 94.6 & 98.9 & 68.3 & 86.5 \\
& Textures & 45.4 & 8.9 & 87.9 & 97.4 & 56.2 & 86.7 \\
& SVHN & 25.7 & 2.7 & 91.9 & 98.9 & 64.0 & 89.8 \\
& Places365 & 46.0 & 38.4 & 87.7 & 92.2 & 57.2 & 71.3 \\
& LSUN & 39.5 & 28.7 & 88.5 & 93.2 & 57.2 & 71.0 \\
& CIFAR-100 & 45.9 & 44.9 & 87.2 & 90.9 & 54.1 & 67.7 \\
\Xhline{0.5\arrayrulewidth} \multicolumn{2}{c|}{Mean} & {28.7} & {\textbf{16.0}} & {91.4} & {\textbf{96.2}} & {63.4} & {\textbf{80.6}} \\
\Xhline{3\arrayrulewidth}
\end{tabularx}
\caption{Out-of-distribution example detection results for the maximum softmax probability (MSP) baseline and our rotation method. All results are percentages and the average result of 5 runs.}
\label{tab:oodmulti}
\end{table}
The full multi-class out-of-distribution detection results are in \Cref{tab:oodmulti}. Auxiliary rotation prediction results in large improvements across the board for numerous anomaly types. In all cases, rotation prediction improves performance. This demonstrates that auxiliary rotation prediction is not only useful for one-class detection but can also augment detectors based on multi-class representations. For descriptions of metrics, we refer the reader to \cite{outlier_exposure}.
\textbf{OOD Datasets.}\quad
For multi-class OOD detection, we evaluate our detectors on a wide variety of OOD data with CIFAR-10 as the in-distribution. \textit{Gaussian} OOD data has each pixel sampled from an isotropic Gaussian distribution. \textit{Rademacher} images have each pixel sampled IID from a Rademacher distribution, which takes values $1$ and $-1$ with equal probability. \textit{Blobs} images are algorithmically generated amorphous shapes with distinct edges. \textit{Textures} is a dataset of describable texture images. \textit{Places365} contains images for scene recognition instead of object recognition. \textit{SVHN} is a dataset of house numbers extracted from Google Street View. \textit{LSUN} is another scene understanding dataset with fewer classes than Places365 \citep{lsun}. \textit{CIFAR-100} is the 100-class counterpart to CIFAR-10. Importantly, the CIFAR-10 and CIFAR-100 classes do not overlap, so CIFAR-100 data is OOD with CIFAR-10 as the in-distribution.
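% A rough sketch of how the synthetic noise OOD sets could be generated in CIFAR-10 format
% (32x32x3 images in [0, 1]); the means and scales below are illustrative, not the exact
% parameters used for our Gaussian and Rademacher data.
% import numpy as np
%
% rng = np.random.default_rng(0)
% gaussian_ood = np.clip(rng.normal(loc=0.5, scale=0.5, size=(1000, 32, 32, 3)), 0, 1)
% rademacher_ood = (rng.choice([-1.0, 1.0], size=(1000, 32, 32, 3)) + 1) / 2  # {-1, 1} mapped to {0, 1}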
%5.32\% to 3.92\% error rate. Describe dataset. Describe anomaly score.
\section{ImageNet OOD Dataset}\label{app:imagenetoodclasses}
The classes are `acorn', `airliner', `ambulance', `American alligator', `banjo', `barn', `bikini', `digital clock', `dragonfly', `dumbbell', `forklift', `goblet', `grand piano', `hotdog', `hourglass', `manhole cover', `mosque', `nail', `parking meter', `pillow', `revolver', `rotary dial telephone', `schooner', `snowmobile', `soccer ball', `stingray', `strawberry', `tank', `toaster', and `volcano'. These classes were selected so that there is no obvious overlap, unlike classes such as `bee' and `honeycomb.' There are 1,300 training images per class and 100 test images per class. To create a dataset with 100 test images per class, we took ImageNet's 50 validation images per class and collected an additional 50 images per class for an expanded test set. The data is available for download at \href{https://github.com/hendrycks/ss-ood}{\texttt{https://github.com/hendrycks/ss-ood}}.
\section{Additional Ablations}\label{app:not_attacking_rotation}
%Give multiclass some screen time since it'd be good to claim that too.\\
\textbf{Not attacking the rotation branch.}\quad
To gauge the effect of attacking the rotation branch during training in \Cref{section:adv}, we retrain the auxiliary rotation method with the adversary only attacking the classification branch. We find this performs similarly to attacking both the classification and rotation branches, which indicates that the rotation loss itself is the crucial component.
\textbf{Comparison with rotation augmentation.}\quad
Our results demonstrate myriad benefits of rotation prediction, so a natural baseline for comparison is rotation data augmentation. To this end, we retrain the baseline network from \Cref{section:common_corruptions} and augment the dataset with rotations of multiples of 90 degrees. We find that this \textit{decreases} average accuracy across corruptions from 72.3\% to 63.7\%. By contrast, training with auxiliary rotation prediction improves average accuracy to 76.9\%.
\end{appendices}
\end{document}