Last active
June 19, 2019 17:30
-
-
Save ipreencekmr/9bec379fb71147b295df663194e91cc0 to your computer and use it in GitHub Desktop.
This class returns the outliers present in your DataFrame, based on the interquartile range (IQR).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Let's check for outliers via the interquartile range (IQR).
from scipy.stats import iqr | |
import pandas as pd | |
import numpy as np | |
class OutlierDetector:
    """Find and cap outliers in DataFrame columns using the 1.5 * IQR rule.

    A value is an outlier when it lies strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR`` of its column.

    Typical use::

        detector = OutlierDetector()
        report = detector.outlier_info(df)   # one row per affected column
        capped = detector.cap_outliers()     # copy of df with outliers capped
        detector.print_log()                 # what was changed, per column

    The results of the most recent ``outlier_info`` call are also exposed as
    parallel lists (``col_names``, ``outlier_count``, ``outlier_indexes``,
    ``outlier_values``, ``medians``, ``unique_values``, ``lower_limit``,
    ``higher_limit``), one entry per column that contains outliers.
    """

    def __init__(self):
        print('OutlierDetector->Init')
        self.dataFrame = pd.DataFrame()
        self.outlier_info_df = pd.DataFrame()
        # Created here so print_log() is safe before cap_outliers() has run.
        self.imputed_dict = {}
        self._reset_outlier_state()

    def _reset_outlier_state(self):
        """Clear the per-column result lists filled by ``outlier_info``."""
        self.col_names = []
        self.outlier_count = []
        self.outlier_indexes = []
        self.outlier_values = []
        self.medians = []
        self.unique_values = []
        self.lower_limit = []
        self.higher_limit = []

    def outlier_info(self, dataFrame):
        """Scan every numeric column of ``dataFrame`` for IQR outliers.

        A copy of ``dataFrame`` is stored on the instance for a later
        ``cap_outliers`` call; the caller's frame is never modified.
        Non-numeric and boolean columns are skipped.

        Args:
            dataFrame: the pandas DataFrame to analyse.

        Returns:
            A DataFrame with one row per column that has at least one
            outlier and the columns 'Attributes', 'Count',
            'Index Locations', 'Outliers', 'Unique', 'Lower Limit',
            'Higher Limit' and 'Median'. It has no rows when nothing is
            found.
        """
        self.dataFrame = dataFrame.copy()
        # Start from a clean slate so repeated calls do not pile up rows
        # from earlier frames.
        self._reset_outlier_state()

        for col in dataFrame.columns:
            series = dataFrame[col]
            is_numeric = pd.api.types.is_numeric_dtype(series)
            if not is_numeric or pd.api.types.is_bool_dtype(series):
                continue

            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            # Derive the IQR from the same quantiles used for the fences.
            # Series.quantile skips NaN, so a single missing value no longer
            # turns both limits into NaN and hides every outlier.
            quartile_range = q3 - q1
            low = q1 - (1.5 * quartile_range)
            high = q3 + (1.5 * quartile_range)

            outliers = series[(series < low) | (series > high)]
            count = outliers.count()
            if count == 0:
                continue

            self.col_names.append(col)
            self.outlier_count.append(count)
            self.outlier_indexes.append(list(outliers.index))
            self.outlier_values.append(outliers.values)
            self.medians.append(series.median())
            self.unique_values.append(list(np.unique(outliers.values)))
            self.lower_limit.append(low)
            self.higher_limit.append(high)

        # Generate the DataFrame with the outlier information.
        self.outlier_info_df = pd.DataFrame({
            'Attributes': self.col_names,
            'Count': self.outlier_count,
            'Index Locations': self.outlier_indexes,
            'Outliers': self.outlier_values,
            'Unique': self.unique_values,
            'Lower Limit': self.lower_limit,
            'Higher Limit': self.higher_limit,
            'Median': self.medians,
        })
        return self.outlier_info_df

    def cap_outliers(self):
        """Cap every outlier found by ``outlier_info`` to its nearest limit.

        Values below the lower limit become the lower limit and values above
        the higher limit become the higher limit. Only the internal copy of
        the data is changed. For each affected column a log DataFrame with
        the columns 'Index Location', 'Previous Value' and 'Imputed Value'
        is stored in ``self.imputed_dict``.

        Rows are addressed by index label, so the frame's index labels are
        expected to be unique.

        Returns:
            The internal DataFrame copy with outliers capped.
        """
        self.imputed_dict = {}
        for _, outlier_row in self.outlier_info_df.iterrows():
            column = outlier_row['Attributes']
            index_list = list(outlier_row['Index Locations'])
            lower_limit = outlier_row['Lower Limit']
            higher_limit = outlier_row['Higher Limit']

            # Read column-wise: a row-wise read (.loc[index][column]) would
            # coerce the value to the common dtype of the whole row.
            previous = self.dataFrame.loc[index_list, column]
            capped = previous.clip(lower=lower_limit, upper=higher_limit)

            # The limits are floats; widen integer columns explicitly rather
            # than relying on implicit upcasting during assignment, which
            # newer pandas versions deprecate.
            if pd.api.types.is_integer_dtype(self.dataFrame[column]):
                self.dataFrame[column] = self.dataFrame[column].astype(float)
            self.dataFrame.loc[index_list, column] = capped.to_numpy()

            self.imputed_dict[column] = pd.DataFrame({
                'Index Location': index_list,
                'Previous Value': previous.tolist(),
                'Imputed Value': capped.tolist(),
            })
        return self.dataFrame

    def print_log(self):
        """Print, per column, the log of values replaced by ``cap_outliers``."""
        for key, imputed_df in self.imputed_dict.items():
            print(key)
            print(imputed_df, '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
# Usage example: import the OutlierFinder module and build a detector from it.
# NOTE(review): presumably OutlierFinder.py is the file holding the
# OutlierDetector class shown above - confirm the module name.
import OutlierFinder as OutlierFinder
outlier_detector = OutlierFinder.OutlierDetector()
# NOTE(review): `dataFrame` is not defined in this snippet; the caller must
# supply it (presumably a pandas DataFrame - confirm).
outlier_detector.outlier_info(dataFrame)