leggitta/grubbs.py

## grubbs.py
import numpy as np
from scipy.stats import t, zscore


def grubbs(X, test='two-tailed', alpha=0.05):

    '''
    Performs Grubbs' test for outliers recursively until the null hypothesis is
    true.

    Parameters
    ----------
    X : ndarray
        A numpy array to be tested for outliers.
    test : str
        Describes the types of outliers to look for. Can be 'min' (look for
        small outliers), 'max' (look for large outliers), or 'two-tailed' (look
        for both).
    alpha : float
        The significance level.

    Returns
    -------
    X : ndarray
        The original array with outliers removed.
    outliers : ndarray
        An array of outliers.
    '''

    Z = zscore(X, ddof=1)  # Z-score
    N = len(X)  # number of samples

    # calculate extreme index and the critical t value based on the test
    if test == 'two-tailed':
        extreme_ix = lambda Z: np.abs(Z).argmax()
        t_crit = lambda N: t.isf(alpha / (2.*N), N-2)
    elif test == 'max':
        extreme_ix = lambda Z: Z.argmax()
        t_crit = lambda N: t.isf(alpha / N, N-2)
    elif test == 'min':
        extreme_ix = lambda Z: Z.argmin()
        t_crit = lambda N: t.isf(alpha / N, N-2)
    else:
        raise ValueError("Test must be 'min', 'max', or 'two-tailed'")

    # compute the threshold
    thresh = lambda N: (N - 1.) / np.sqrt(N) * \
        np.sqrt(t_crit(N)**2 / (N - 2 + t_crit(N)**2))

    # create array to store outliers
    outliers = np.array([])

    # loop throught the array and remove any outliers
    while abs(Z[extreme_ix(Z)]) > thresh(N):

        # update the outliers
        outliers = np.r_[outliers, X[extreme_ix(Z)]]
        # remove outlier from array
        X = np.delete(X, extreme_ix(Z))
        # repeat Z score
        Z = zscore(X, ddof=1)
        N = len(X)

    return X, outliers


if __name__ == "__main__":

    # setup some test arrays
    X = np.arange(-5, 6)
    X1 = np.r_[X, 100]
    X2 = np.r_[X, -100]

    # test the two-tailed case
    Y, out = grubbs(X1)
    assert out == 100
    Y, out = grubbs(X2)
    assert out == -100

    # test the max case
    Y, out = grubbs(X1, test='max')
    assert out == 100
    Y, out = grubbs(X2, test='max')
    assert len(out) == 0

    # test the min case
    Y, out = grubbs(X1, test='min')
    assert len(out) == 0
    Y, out = grubbs(X2, test='min')
    assert out == -100
	import numpy as np
	from scipy.stats import t, zscore


	def grubbs(X, test='two-tailed', alpha=0.05):

	'''
	Performs Grubbs' test for outliers recursively until the null hypothesis is
	true.

	Parameters
	----------
	X : ndarray
	A numpy array to be tested for outliers.
	test : str
	Describes the types of outliers to look for. Can be 'min' (look for
	small outliers), 'max' (look for large outliers), or 'two-tailed' (look
	for both).
	alpha : float
	The significance level.

	Returns
	-------
	X : ndarray
	The original array with outliers removed.
	outliers : ndarray
	An array of outliers.
	'''

	Z = zscore(X, ddof=1) # Z-score
	N = len(X) # number of samples

	# calculate extreme index and the critical t value based on the test
	if test == 'two-tailed':
	extreme_ix = lambda Z: np.abs(Z).argmax()
	t_crit = lambda N: t.isf(alpha / (2.*N), N-2)
	elif test == 'max':
	extreme_ix = lambda Z: Z.argmax()
	t_crit = lambda N: t.isf(alpha / N, N-2)
	elif test == 'min':
	extreme_ix = lambda Z: Z.argmin()
	t_crit = lambda N: t.isf(alpha / N, N-2)
	else:
	raise ValueError("Test must be 'min', 'max', or 'two-tailed'")

	# compute the threshold
	thresh = lambda N: (N - 1.) / np.sqrt(N) * \
	np.sqrt(t_crit(N)2 / (N - 2 + t_crit(N)2))

	# create array to store outliers
	outliers = np.array([])

	# loop throught the array and remove any outliers
	while abs(Z[extreme_ix(Z)]) > thresh(N):

	# update the outliers
	outliers = np.r_[outliers, X[extreme_ix(Z)]]
	# remove outlier from array
	X = np.delete(X, extreme_ix(Z))
	# repeat Z score
	Z = zscore(X, ddof=1)
	N = len(X)

	return X, outliers


	if __name__ == "__main__":

	# setup some test arrays
	X = np.arange(-5, 6)
	X1 = np.r_[X, 100]
	X2 = np.r_[X, -100]

	# test the two-tailed case
	Y, out = grubbs(X1)
	assert out == 100
	Y, out = grubbs(X2)
	assert out == -100

	# test the max case
	Y, out = grubbs(X1, test='max')
	assert out == 100
	Y, out = grubbs(X2, test='max')
	assert len(out) == 0

	# test the min case
	Y, out = grubbs(X1, test='min')
	assert len(out) == 0
	Y, out = grubbs(X2, test='min')
	assert out == -100