joseph-allen/outlier_detection

## outlier_detection
import numpy as np
from collections import Counter


def detect_outliers(df, n, features):
    """
    Return the index labels of the rows of ``df`` that are outliers in more
    than ``n`` of the given features, according to the Tukey method.

    For each column, a value is an outlier when it lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the
    column's 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Missing values (NaN) are ignored when computing the quartiles and are
    never flagged as outliers themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the columns named in ``features``.
    n : int
        Threshold: a row is returned only when it is an outlier in strictly
        more than ``n`` features.
    features : iterable
        Labels of the numeric columns to inspect.

    Returns
    -------
    list
        Index labels of the selected rows, in order of first detection.
    """
    outlier_counts = Counter()

    # iterate over features (columns)
    for col in features:
        column = df[col]

        # np.percentile returns NaN as soon as the column holds one NaN; the
        # fence comparisons below would then all be False and this feature
        # would silently contribute no outliers. nanpercentile skips NaNs.
        q1, q3 = np.nanpercentile(column, [25, 75])

        # Tukey fences sit 1.5 interquartile ranges outside the quartiles
        outlier_step = 1.5 * (q3 - q1)

        is_outlier = (column < q1 - outlier_step) | (column > q3 + outlier_step)

        # count one hit per outlying row for this feature
        outlier_counts.update(column[is_outlier].index)

    # keep the observations that are outliers in more than n features
    return [idx for idx, hits in outlier_counts.items() if hits > n]

# Example call: the columns to screen for outliers
lof = ['example_feature_1', 'example_feature_2']
# Arguments are the dataset, the outlier count a row must exceed to be
# rejected, and the list of features to inspect
Outliers_to_drop = detect_outliers(df=dataset, n=2, features=lof)
	import numpy as np
	from collections import Counter


	def detect_outliers(df, n, features):
	    """
	    Return the index labels of the rows of ``df`` that are outliers in more
	    than ``n`` of the given features, according to the Tukey method.

	    For each column, a value is an outlier when it lies below
	    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the
	    column's 25th and 75th percentiles and ``IQR = Q3 - Q1``.

	    Missing values (NaN) are ignored when computing the quartiles and are
	    never flagged as outliers themselves.

	    Parameters
	    ----------
	    df : pandas.DataFrame
	        Data containing the columns named in ``features``.
	    n : int
	        Threshold: a row is returned only when it is an outlier in strictly
	        more than ``n`` features.
	    features : iterable
	        Labels of the numeric columns to inspect.

	    Returns
	    -------
	    list
	        Index labels of the selected rows, in order of first detection.
	    """
	    outlier_counts = Counter()

	    # iterate over features (columns)
	    for col in features:
	        column = df[col]

	        # np.percentile returns NaN as soon as the column holds one NaN; the
	        # fence comparisons below would then all be False and this feature
	        # would silently contribute no outliers. nanpercentile skips NaNs.
	        q1, q3 = np.nanpercentile(column, [25, 75])

	        # Tukey fences sit 1.5 interquartile ranges outside the quartiles
	        outlier_step = 1.5 * (q3 - q1)

	        is_outlier = (column < q1 - outlier_step) | (column > q3 + outlier_step)

	        # count one hit per outlying row for this feature
	        outlier_counts.update(column[is_outlier].index)

	    # keep the observations that are outliers in more than n features
	    return [idx for idx, hits in outlier_counts.items() if hits > n]

	# Example call: the columns to screen for outliers
	lof = ['example_feature_1', 'example_feature_2']
	# Arguments are the dataset, the outlier count a row must exceed to be
	# rejected, and the list of features to inspect
	Outliers_to_drop = detect_outliers(df=dataset, n=2, features=lof)