Created
May 12, 2014 16:44
-
-
Save denis-bz/6b53badaf256566f59ae to your computer and use it in GitHub Desktop.
Trim outliers in Numpy arrays: smallest n values = the next smallest, biggest n = the next biggest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" trim outliers: smallest n values = next smallest, biggest n = next biggest | |
X, smallest, biggest = winsorize( X, n ) | |
-> X trimmed *inplace* | |
smallest: n+1 smallest | |
biggest: n+1 biggest | |
To winsorize the top 5 % and bottom 5 %, | |
winsorize( X, X.size * .05 ) | |
X may be 2d, 3d ... (but winsorizing each dimension separately would make more sense) | |
See also http://en.wikipedia.org/wiki/Winsorising | |
""" | |
# http://en.wikipedia.org/wiki/Least_trimmed_squares | |
import numpy as np | |
def winsorize( X, n=1 ): | |
winsorize.__doc__ = globals()["__doc__"] # as above | |
X = np.asarray_chkfinite(X) | |
shape = X.shape | |
X = X.reshape(-1) # flat | |
J = np.argsort( X ) # fast enough, 1m: 135 ms on 2.7GHz imac | |
if not isinstance( n, int ): | |
n = max( int(round( n )), 1 ) | |
jsmall = J[:n+1] | |
jbig = J[-n-1:] | |
small = X[jsmall] | |
big = X[jbig] | |
X[jsmall[:-1]] = X[jsmall[-1]] | |
X[jbig[1:]] = X[jbig[0]] | |
return X.reshape(shape), small, big | |
#............................................................................... | |
if __name__ == "__main__": | |
import sys | |
N = 1e6 | |
nwin = 1 | |
seed = 0 | |
# run this.py a=1 b=None c=[3] 'd = expr' ... in sh or ipython | |
for arg in sys.argv[1:]: | |
exec( arg ) | |
np.set_printoptions( 2, threshold=100, edgeitems=10, linewidth=100, suppress=True, | |
formatter = dict( float = lambda x: "%.2g" % x )) | |
np.random.seed(seed) | |
N = int(N) | |
#............................................................................... | |
X = np.random.uniform( size=N ) | |
X, small, big = winsorize( X, nwin ) | |
print "winsorize( %d, %d )" % (N, nwin) | |
print "smallest:", small | |
print "biggest: 1 -", 1 - big | |
print "X.min %.2g \nX.max 1 - %.2g" % (X.min(), 1 - X.max()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment