Skip to content

Instantly share code, notes, and snippets.

@ipreencekmr
Last active June 19, 2019 17:30
Show Gist options
  • Save ipreencekmr/9bec379fb71147b295df663194e91cc0 to your computer and use it in GitHub Desktop.
Save ipreencekmr/9bec379fb71147b295df663194e91cc0 to your computer and use it in GitHub Desktop.
This class returns the outliers present in your DataFrame, based on the interquartile range (IQR).
# Let's detect outliers using the interquartile range (IQR)
from scipy.stats import iqr
import pandas as pd
import numpy as np
class OutlierDetector:
    """Find and cap outliers in the numeric columns of a DataFrame.

    A value is an outlier when it lies strictly below
    ``Q1 - whisker * IQR`` or strictly above ``Q3 + whisker * IQR``, where
    ``IQR = Q3 - Q1`` and the quartiles are computed per column.

    Typical use::

        detector = OutlierDetector()
        report = detector.outlier_info(df)   # what was found
        capped = detector.cap_outliers()     # copy of df with outliers capped
        detector.print_log()                 # what was replaced

    The per-column result lists (``col_names``, ``outlier_count``, ...) remain
    available as attributes and always describe the most recent
    ``outlier_info`` call.
    """

    def __init__(self):
        print('OutlierDetector->Init')
        self.dataFrame = pd.DataFrame()
        self.outlier_info_df = pd.DataFrame()
        # Created here (not only in cap_outliers) so that print_log() is safe
        # to call before any capping has happened.
        self.imputed_dict = {}
        self._reset_report()

    def _reset_report(self):
        """Clear the per-column result lists filled in by ``outlier_info``."""
        self.col_names = []
        self.outlier_count = []
        self.outlier_indexes = []
        self.outlier_values = []
        self.medians = []
        self.unique_values = []
        self.lower_limit = []
        self.higher_limit = []

    @staticmethod
    def _is_measurable(series):
        """Return True when quartiles can be computed for ``series``."""
        dtype = series.dtype
        return (pd.api.types.is_numeric_dtype(dtype)
                and not pd.api.types.is_bool_dtype(dtype))

    def outlier_info(self, dataFrame, whisker=1.5):
        """Scan ``dataFrame`` and report the outliers of each numeric column.

        A private copy of ``dataFrame`` is stored for ``cap_outliers``; the
        caller's frame is never modified.

        Args:
            dataFrame: pandas DataFrame to analyse. Non-numeric and boolean
                columns are skipped.
            whisker: multiple of the IQR that defines the fences
                (default 1.5).

        Returns:
            A DataFrame with one row per column that contains outliers and
            the columns 'Attributes', 'Count', 'Index Locations', 'Outliers',
            'Unique', 'Lower Limit', 'Higher Limit' and 'Median'.
        """
        self.dataFrame = dataFrame.copy()
        # Start from a clean slate: without this, a second call would append
        # to the lists of the previous call and produce duplicated rows.
        self._reset_report()
        self.imputed_dict = {}

        for col in dataFrame.columns:
            series = dataFrame[col]
            if not self._is_measurable(series):
                continue

            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            # Derive the IQR from the same NaN-skipping quantiles as the
            # fences. scipy.stats.iqr propagates NaN by default, which would
            # turn both fences into NaN and hide every outlier of a column
            # that has even one missing value.
            spread = q3 - q1
            low = q1 - (whisker * spread)
            high = q3 + (whisker * spread)

            outliers = series[(series < low) | (series > high)]
            if outliers.empty:
                continue

            self.col_names.append(col)
            self.outlier_count.append(len(outliers))
            self.outlier_indexes.append(list(outliers.index))
            self.outlier_values.append(outliers.values)
            self.medians.append(series.median())
            self.unique_values.append(list(np.unique(outliers.values)))
            self.lower_limit.append(low)
            self.higher_limit.append(high)

        # Generate the DataFrame with the outlier information.
        self.outlier_info_df = pd.DataFrame({
            'Attributes': self.col_names,
            'Count': self.outlier_count,
            'Index Locations': self.outlier_indexes,
            'Outliers': self.outlier_values,
            'Unique': self.unique_values,
            'Lower Limit': self.lower_limit,
            'Higher Limit': self.higher_limit,
            'Median': self.medians,
        })
        return self.outlier_info_df

    def cap_outliers(self):
        """Replace every reported outlier with the nearest fence value.

        Works on the internal copy taken by ``outlier_info`` and uses the
        limits stored in ``outlier_info_df``. For each capped column a log
        frame ('Index Location', 'Previous Value', 'Imputed Value') is stored
        in ``self.imputed_dict``.

        Returns:
            The internal (capped) DataFrame. It is empty if ``outlier_info``
            has not been called yet.
        """
        self.imputed_dict = {}
        report = self.outlier_info_df
        if report.empty:
            return self.dataFrame

        limits = zip(report['Attributes'],
                     report['Lower Limit'],
                     report['Higher Limit'])
        for column, low, high in limits:
            column_data = self.dataFrame[column]
            flagged = (column_data < low) | (column_data > high)
            previous = column_data[flagged]
            imputed = np.where(previous.to_numpy() < low, low, high)

            # clip() caps the whole column at once and upcasts integer
            # columns when a fence is fractional, instead of writing floats
            # into an integer column one label at a time. Selecting by mask
            # rather than by label also keeps this correct when the index
            # contains duplicate labels.
            self.dataFrame[column] = column_data.clip(lower=low, upper=high)

            self.imputed_dict[column] = pd.DataFrame({
                'Index Location': list(previous.index),
                'Previous Value': previous.to_numpy(),
                'Imputed Value': imputed,
            })
        return self.dataFrame

    def print_log(self):
        """Print, per column, the values replaced by ``cap_outliers``."""
        for column, log in self.imputed_dict.items():
            print(column)
            print(log, '\n')
@ipreencekmr
Copy link
Author

# Usage example. Assumes the OutlierDetector class is saved in a module named
# OutlierFinder (e.g. OutlierFinder.py) on the import path -- TODO confirm.
import OutlierFinder as OutlierFinder

# Create a detector instance from that module.
outlier_detector = OutlierFinder.OutlierDetector()
# NOTE(review): `dataFrame` is not defined in this snippet; the caller must
# supply it (presumably a pandas DataFrame -- confirm against outlier_info).
outlier_detector.outlier_info(dataFrame)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment