Skip to content

Instantly share code, notes, and snippets.

@leggitta
Created December 31, 2015 20:06
Show Gist options
  • Save leggitta/2980f835fc62dedc99c5 to your computer and use it in GitHub Desktop.
Save leggitta/2980f835fc62dedc99c5 to your computer and use it in GitHub Desktop.
import numpy as np
from scipy.stats import t, zscore
def grubbs(X, test='two-tailed', alpha=0.05):
'''
Performs Grubbs' test for outliers recursively until the null hypothesis is
true.
Parameters
----------
X : ndarray
A numpy array to be tested for outliers.
test : str
Describes the types of outliers to look for. Can be 'min' (look for
small outliers), 'max' (look for large outliers), or 'two-tailed' (look
for both).
alpha : float
The significance level.
Returns
-------
X : ndarray
The original array with outliers removed.
outliers : ndarray
An array of outliers.
'''
Z = zscore(X, ddof=1) # Z-score
N = len(X) # number of samples
# calculate extreme index and the critical t value based on the test
if test == 'two-tailed':
extreme_ix = lambda Z: np.abs(Z).argmax()
t_crit = lambda N: t.isf(alpha / (2.*N), N-2)
elif test == 'max':
extreme_ix = lambda Z: Z.argmax()
t_crit = lambda N: t.isf(alpha / N, N-2)
elif test == 'min':
extreme_ix = lambda Z: Z.argmin()
t_crit = lambda N: t.isf(alpha / N, N-2)
else:
raise ValueError("Test must be 'min', 'max', or 'two-tailed'")
# compute the threshold
thresh = lambda N: (N - 1.) / np.sqrt(N) * \
np.sqrt(t_crit(N)**2 / (N - 2 + t_crit(N)**2))
# create array to store outliers
outliers = np.array([])
# loop throught the array and remove any outliers
while abs(Z[extreme_ix(Z)]) > thresh(N):
# update the outliers
outliers = np.r_[outliers, X[extreme_ix(Z)]]
# remove outlier from array
X = np.delete(X, extreme_ix(Z))
# repeat Z score
Z = zscore(X, ddof=1)
N = len(X)
return X, outliers
if __name__ == "__main__":
# setup some test arrays
X = np.arange(-5, 6)
X1 = np.r_[X, 100]
X2 = np.r_[X, -100]
# test the two-tailed case
Y, out = grubbs(X1)
assert out == 100
Y, out = grubbs(X2)
assert out == -100
# test the max case
Y, out = grubbs(X1, test='max')
assert out == 100
Y, out = grubbs(X2, test='max')
assert len(out) == 0
# test the min case
Y, out = grubbs(X1, test='min')
assert len(out) == 0
Y, out = grubbs(X2, test='min')
assert out == -100
@hauckjo
Copy link

hauckjo commented May 19, 2021

Where can I get the output without the outliers?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment