Skip to content

Instantly share code, notes, and snippets.

@Vuong-Chu
Last active August 31, 2023 18:28
Show Gist options
  • Save Vuong-Chu/59524d82bc66de160913f28934e2a815 to your computer and use it in GitHub Desktop.
Save Vuong-Chu/59524d82bc66de160913f28934e2a815 to your computer and use it in GitHub Desktop.
This function removes outliers from the columns of a DataFrame while leaving missing values untouched, so that they can be processed in later steps.
# Define function to detect outliers for numerical variables
import pandas as pd
def clean_outliers(data, types = "IQR", threshold = 3.0):
    """
    Remove the rows of a DataFrame that contain an outlier in any column.

    Missing values are never flagged as outliers: a row holding NaN is kept
    unless another column of that same row is an outlier, so missing data
    can be handled in a later step. The input DataFrame is not modified.

    Parameters:
        data (DataFrame): Raw data to clean. Every column must support
            quantile/mean/std and arithmetic (i.e. be numeric).
        types (str): Detection method, "IQR" (values outside
            Q1 - 1.5 * IQR .. Q3 + 1.5 * IQR) or "Z" (absolute Z-score).
        threshold (float or int): Z-score cut-off above which a value is an
            outlier. Only used when types == "Z".

    Returns:
        result (DataFrame): The rows of data without any outlier, keeping
            their original index labels.

    Raises:
        Warning: If types is neither "IQR" nor "Z".
    """
    # Validate once, before touching the data. The exception type stays
    # Warning (not ValueError) because existing callers may catch it.
    if types not in ("IQR", "Z"):
        raise Warning("Only 2 types: IQR or Z")

    def outlier_mask(column):
        """Return a boolean Series, aligned with column, marking outliers."""
        # quantile/mean/std skip NaN by default, and every comparison with
        # NaN is False, so missing values need no special treatment and the
        # mask keeps the full length of the column.
        if types == "IQR":
            q1 = column.quantile(0.25)
            q3 = column.quantile(0.75)
            iqr = q3 - q1
            lower_fence = q1 - 1.5 * iqr
            upper_fence = q3 + 1.5 * iqr
            return (column < lower_fence) | (column > upper_fence)
        z_score = (column - column.mean()) / column.std()
        return z_score.abs() > threshold

    # A row is dropped as soon as one of its columns is flagged.
    row_has_outlier = pd.Series(False, index=data.index)
    for _, column in data.items():
        row_has_outlier |= outlier_mask(column)

    # Boolean indexing returns a new DataFrame; `~` (not unary minus) is the
    # supported way to invert a boolean mask.
    return data[~row_has_outlier]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment