Skip to content

Instantly share code, notes, and snippets.

@mutaku
Last active October 11, 2016 19:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mutaku/461a28d9b923cd3d1d7329d63ac12690 to your computer and use it in GitHub Desktop.
Save mutaku/461a28d9b923cd3d1d7329d63ac12690 to your computer and use it in GitHub Desktop.
def mad(data, b=None):
    """Return the (scaled) Median Absolute Deviation of data.

    The result is ``b * median(|x - median(data)|)``.

    data: sequence (or array) of numeric values.
    b: scale factor applied to the raw median absolute deviation. When
       omitted (None) it defaults to ``1 / norm.ppf(0.75)`` (about
       1.4826), the usual constant that makes the MAD comparable to a
       standard deviation for normally distributed data.

    Returns the scaled MAD as a NumPy float.
    """
    # Compare against None so that an explicit numeric factor (even 0)
    # is honoured rather than silently replaced by the default.
    if b is None:
        b = 1 / norm.ppf(0.75)
    values = np.asarray(data)
    median_of_data = np.median(values)
    # Vectorised absolute deviations; np.median cannot consume the lazy
    # iterator that map() returns on Python 3.
    median_deviation = np.median(np.abs(values - median_of_data))
    return b * median_deviation
def outliers(data, mad, bounds='both', cutoff=3.0):
    """
    Use MAD to identify outliers in a data set.

    Per Miller 1991
    http://dx.doi.org/10.1080/14640749108400962

    data: sequence of numeric values (it is iterated more than once).
    mad: the median absolute deviation value to use for data. Note that
         inside this function the name refers to this number, not to the
         module-level ``mad`` function.
    cutoff: number of MADs from the median beyond which a value counts
            as an outlier:
              3   (very conservative)
              2.5 (moderately conservative)
              2   (poorly conservative)
    bounds: 'both'  finds outliers above and below
            'lower' finds outliers below data
            'upper' finds outliers above data

    Returns a list of the outlying values, in their original order.
    Raises KeyError if bounds is not one of the three names above.
    """
    median_of_data = np.median(data)
    spread = mad * cutoff
    lower_bound = median_of_data - spread
    upper_bound = median_of_data + spread
    limits = {
        'both': lambda x: x > upper_bound or x < lower_bound,
        'upper': lambda x: x > upper_bound,
        'lower': lambda x: x < lower_bound,
    }
    is_outlier = limits[bounds]
    # Build a real list: on Python 3 filter() yields a one-shot iterator
    # that is always truthy and is exhausted by the first membership test.
    return [x for x in data if is_outlier(x)]
def apply_outlier_mask(data, data_outliers=None, bounds='both', cutoff=3.0):
    """Generate a data mask identifying outliers via MAD.

    data: sequence of numeric values.
    data_outliers: optional collection of values already known to be
                   outliers. When None, it is computed with
                   ``outliers(data, mad(data), bounds, cutoff)``.
    bounds, cutoff: forwarded to ``outliers`` only when data_outliers is
                    not supplied.

    Returns a list with one entry per element of data: 1 if the element
    is among the outliers, otherwise 0.
    """
    # Only compute when nothing was supplied; an explicitly passed empty
    # collection means "no outliers" and must not trigger a recompute.
    if data_outliers is None:
        data_outliers = outliers(data, mad(data), bounds, cutoff)
    # Materialise once so a lazy iterator is not consumed by the first
    # membership test, then prefer a set for constant-time lookups.
    known = list(data_outliers)
    try:
        lookup = set(known)
    except TypeError:
        # Unhashable elements: fall back to linear membership tests.
        lookup = known
    return [1 if x in lookup else 0 for x in data]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment