dylanbstorey/malhalanobis.py

## malhalanobis.py
def is_outlier(list, threshold=3.5):
    """
    Returns a boolean array with True if points are outliers and False
    otherwise.

    Each observation's distance from the per-column median is the Euclidean
    norm of its deviation; that distance is scaled by the median of all
    distances to give a modified z-score.

    Parameters:
    -----------
        list : An numobservations-length sequence/array, or an
            numobservations by numdimensions array of observations. (The
            name shadows the builtin; it is kept so keyword callers keep
            working.)
        threshold : The modified z-score to use as a threshold. Observations
            with a modified z-score (based on the median absolute deviation)
            greater than this value will be classified as outliers.

    Returns:
    --------
        mask : A numobservations-length boolean array. When the median
            deviation is zero, observations equal to the median are False
            and every other observation is True.

    References:
    ----------
        Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
        Handle Outliers", The ASQC Basic References in Quality Control:
        Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
    """
    # Imported locally so the function does not depend on an import that
    # appears later (or not at all) in the surrounding file.
    import numpy as np

    # The original body read undefined names (`points`, `thresh`); bind the
    # real parameters instead and accept any array-like input.
    points = np.asarray(list)
    if points.ndim == 1:
        # Treat a flat sequence as one-dimensional observations (one column).
        points = points[:, None]

    median = np.median(points, axis=0)
    # Euclidean distance of every observation from the per-column median.
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)

    # A zero median deviation yields inf (nonzero distance) or nan (zero
    # distance); the comparison below maps those to True / False, so the
    # floating-point warnings carry no extra information.
    with np.errstate(divide="ignore", invalid="ignore"):
        modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > threshold

## median_absolute_deviation.py
import numpy as np

def mad_outlier(data, deviations=3.5):
    """Median Absolute Deviation Outlier Detection

    Returns a masking array for if points are outliers (True) or not (False).
    Each observation's error is the sum of its absolute per-column deviations
    from the column medians; errors are scaled by the median error.

    Args:
        data (array-like): Observations, either a flat sequence or an
            observations-by-dimensions array. Lists, tuples and arrays are
            all accepted.
        deviations (float, optional): How many deviations to use for masking.

    Returns:
         (np.ndarray): Boolean masking array with one entry per observation.
            When the median error is zero, observations equal to the median
            are False and every other observation is True.
    """

    # Force the dataset into columnar format. np.asarray also covers tuples
    # and list subclasses, which the former `type(data) == list` check missed.
    data = np.asarray(data)
    if data.ndim == 1:
        data = data[:, None]

    # Calculate the per-column median
    median = np.median(data, axis=0)

    # Absolute error from the median, summed across columns so that each
    # observation collapses to a single value.
    error = np.sum(np.absolute(data - median), axis=-1)

    median_error = np.median(error)  # Actual median of the errors

    # A zero median error yields inf/nan here; the comparison below maps
    # those to True/False, so the floating-point warnings are suppressed.
    with np.errstate(divide="ignore", invalid="ignore"):
        z_score = 0.6745 * error / median_error  # error -> deviation

    return z_score > deviations  # mask where the z_score > deviations
	def is_outlier(list, threshold=3.5):
		"""
		Returns a boolean array with True if points are outliers and False
		otherwise.

		Each observation's distance from the per-column median is the
		Euclidean norm of its deviation; that distance is scaled by the
		median of all distances to give a modified z-score.

		Parameters:
		-----------
		list : An numobservations-length sequence/array, or an
			numobservations by numdimensions array of observations. (The
			name shadows the builtin; it is kept so keyword callers keep
			working.)
		threshold : The modified z-score to use as a threshold. Observations
			with a modified z-score (based on the median absolute deviation)
			greater than this value will be classified as outliers.

		Returns:
		--------
		mask : A numobservations-length boolean array. When the median
			deviation is zero, observations equal to the median are False
			and every other observation is True.

		References:
		----------
		Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
		Handle Outliers", The ASQC Basic References in Quality Control:
		Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
		"""
		# Imported locally so the function does not depend on an import that
		# only appears after this definition in the surrounding text.
		import numpy as np

		# The original body read undefined names (`points`, `thresh`); bind
		# the real parameters instead and accept any array-like input.
		points = np.asarray(list)
		if points.ndim == 1:
			# Treat a flat sequence as one-dimensional observations.
			points = points[:, None]

		median = np.median(points, axis=0)
		# Euclidean distance of every observation from the per-column median.
		diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
		med_abs_deviation = np.median(diff)

		# A zero median deviation yields inf/nan; the comparison below maps
		# those to True/False, so the floating-point warnings are suppressed.
		with np.errstate(divide="ignore", invalid="ignore"):
			modified_z_score = 0.6745 * diff / med_abs_deviation

		return modified_z_score > threshold
	import numpy as np

	def mad_outlier(data, deviations=3.5):
		"""Median Absolute Deviation Outlier Detection

		Returns a masking array for if points are outliers (True) or not
		(False). Each observation's error is the sum of its absolute
		per-column deviations from the column medians; errors are scaled by
		the median error.

		Args:
			data (array-like): Observations, either a flat sequence or an
				observations-by-dimensions array. Lists, tuples and arrays
				are all accepted.
			deviations (float, optional): How many deviations to use for
				masking.

		Returns:
			(np.ndarray): Boolean masking array with one entry per
				observation. When the median error is zero, observations
				equal to the median are False and every other one is True.
		"""

		# Force the dataset into columnar format. np.asarray also covers
		# tuples and list subclasses, which `type(data) == list` missed.
		data = np.asarray(data)
		if data.ndim == 1:
			data = data[:, None]

		# Calculate the per-column median
		median = np.median(data, axis=0)

		# Absolute error from the median, summed across columns so that each
		# observation collapses to a single value.
		error = np.sum(np.absolute(data - median), axis=-1)

		median_error = np.median(error)  # Actual median of the errors

		# A zero median error yields inf/nan here; the comparison below maps
		# those to True/False, so the floating-point warnings are suppressed.
		with np.errstate(divide="ignore", invalid="ignore"):
			z_score = 0.6745 * error / median_error  # error -> deviation

		return z_score > deviations  # mask where the z_score > deviations