Skip to content

Instantly share code, notes, and snippets.

@dylanbstorey
Created February 8, 2017 01:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dylanbstorey/5fcdb7e631f64ea44c09f5d6cb67ae7d to your computer and use it in GitHub Desktop.
Save dylanbstorey/5fcdb7e631f64ea44c09f5d6cb67ae7d to your computer and use it in GitHub Desktop.
Outlier Detection
def is_outlier(list, threshold=3.5):
    """Flag outliers using the modified z-score (median absolute deviation).

    Parameters
    ----------
    list : array-like
        Observations, either 1-D (one value per observation) or 2-D
        (num_observations x num_dimensions). The parameter name shadows
        the builtin but is kept so existing callers keep working.
    threshold : float, optional
        The modified z-score to use as a threshold. Observations with a
        modified z-score (based on the median absolute deviation) greater
        than this value are classified as outliers.

    Returns
    -------
    mask : numpy.ndarray of bool
        A num_observations-length array, True where the observation is an
        outlier and False otherwise.

    References
    ----------
    Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
    Handle Outliers", The ASQC Basic References in Quality Control:
    Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
    """
    # Accept plain Python sequences as well as arrays.
    points = np.asarray(list)
    if points.ndim == 1:
        # Treat a flat sequence as num_observations x 1.
        points = points[:, None]
    if points.shape[0] == 0:
        # Nothing to score; avoids the NaN median of an empty array.
        return np.zeros(0, dtype=bool)

    median = np.median(points, axis=0)
    # Euclidean distance of every observation from the per-dimension median.
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)
    if med_abs_deviation == 0:
        # At least half of the observations sit exactly on the median, so
        # the score would be 0/0 or x/0. Flag every point that deviates at
        # all, which is the limiting behaviour of the formula below.
        return diff > 0

    modified_z_score = 0.6745 * diff / med_abs_deviation
    return modified_z_score > threshold
import numpy as np
def mad_outlier(data, deviations=3.5):
    """Median Absolute Deviation outlier detection.

    Returns a mask that is True where an observation is an outlier and
    False where it is not.

    Args:
        data (array-like): Observations, either 1-D or 2-D
            (observations x dimensions). Lists, tuples and NumPy arrays
            are accepted.
        deviations (float, optional): How many scaled deviations from the
            median an observation may lie before it is masked.

    Returns:
        numpy.ndarray: Boolean mask with one entry per observation.
    """
    # Force the dataset into columnar format; asarray also covers tuples
    # and other sequences, not just ``list``.
    points = np.asarray(data)
    if points.ndim == 1:
        points = points[:, None]
    if points.shape[0] == 0:
        # Nothing to score; avoids the NaN median of an empty array.
        return np.zeros(0, dtype=bool)

    median = np.median(points, axis=0)
    # Absolute error from the median, summed across dimensions so that
    # each observation collapses to a single value.
    error = np.sum(np.absolute(points - median), axis=-1)
    median_error = np.median(error)
    if median_error == 0:
        # At least half of the observations equal the median, so the
        # division below would be 0/0 or x/0. Any point that deviates at
        # all is flagged, matching the inf/nan outcome of the formula
        # without emitting divide-by-zero warnings.
        return error > 0

    # 0.6745 scales each error into a modified z-score.
    z_score = 0.6745 * error / median_error
    return z_score > deviations
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment