# Vuong-Chu/Process_outliers.py

## Process_outliers.py
# Define a function that detects and removes outliers in numerical variables
import pandas as pd

def clean_outliers(data, types = "IQR", threshold = 3.0):
    '''
    Drop every row that holds an outlier in at least one numeric column.

    Missing values are never treated as outliers: pandas skips them when the
    column statistics are computed, and a row containing them is kept unless
    another column flags it. Non-numeric columns are not inspected. The input
    frame is not modified.

    Parameters:
        data (DataFrame): Raw data in which outliers are detected and removed.
        types (str): Detection method, "IQR" (interquartile range with
            1.5 * IQR fences) or "Z" (Z-score).
        threshold (float or int): Absolute Z-score above which a value counts
            as an outlier. Only used when types is "Z".
    Returns:
        result (DataFrame): New frame without the rows flagged as outliers.
    Raises:
        Warning: If types is neither "IQR" nor "Z".
    '''

    # Validate once, before touching the data. The exception type and message
    # are kept unchanged so existing callers that catch Warning still work.
    if types not in ("IQR", "Z"):
        raise Warning("Only 2 types: IQR or Z")

    def detect_discrete_outliers(column, types, threshold):
        # Return a boolean Series, aligned with column, marking its outliers.
        if types == "IQR":
            q1 = column.quantile(0.25)
            q3 = column.quantile(0.75)
            iqr = q3 - q1
            flagged = (column < q1 - 1.5 * iqr) | (column > q3 + 1.5 * iqr)
        else:
            # A zero or undefined standard deviation yields NaN scores, which
            # compare as False, so constant columns flag nothing.
            z_score = (column - column.mean()) / column.std()
            flagged = z_score.abs() > threshold
        # Nullable dtypes produce <NA> for missing entries; treat those as
        # "not an outlier", the same way plain NaN already compares.
        return flagged.fillna(False).astype(bool)

    # Start from an all-False mask that shares the frame's index so every
    # per-column mask combines positionally, even when there are no columns.
    is_outlier = pd.Series(False, index=data.index, dtype=bool)
    # items() yields each column as a Series, including duplicated labels.
    for _, column in data.select_dtypes(include="number").items():
        is_outlier = is_outlier | detect_discrete_outliers(column, types, threshold)

    # Boolean indexing returns a new frame, so the caller's data is untouched.
    return data[~is_outlier]
	# Define function to detect outliers for numerical variables
	import pandas as pd

	def clean_outliers(data, types = "IQR", threshold = 3.0):
		'''
		Drop every row that holds an outlier in at least one numeric column.

		Missing values are never treated as outliers: pandas skips them when the
		column statistics are computed, and a row containing them is kept unless
		another column flags it. Non-numeric columns are not inspected. The input
		frame is not modified.

		Parameters:
			data (DataFrame): Raw data in which outliers are detected and removed.
			types (str): Detection method, "IQR" (interquartile range with
				1.5 * IQR fences) or "Z" (Z-score).
			threshold (float or int): Absolute Z-score above which a value counts
				as an outlier. Only used when types is "Z".
		Returns:
			result (DataFrame): New frame without the rows flagged as outliers.
		Raises:
			Warning: If types is neither "IQR" nor "Z".
		'''

		# Validate once, before touching the data. The exception type and message
		# are kept unchanged so existing callers that catch Warning still work.
		if types not in ("IQR", "Z"):
			raise Warning("Only 2 types: IQR or Z")

		def detect_discrete_outliers(column, types, threshold):
			# Return a boolean Series, aligned with column, marking its outliers.
			if types == "IQR":
				q1 = column.quantile(0.25)
				q3 = column.quantile(0.75)
				iqr = q3 - q1
				flagged = (column < q1 - 1.5 * iqr) | (column > q3 + 1.5 * iqr)
			else:
				# A zero or undefined standard deviation yields NaN scores, which
				# compare as False, so constant columns flag nothing.
				z_score = (column - column.mean()) / column.std()
				flagged = z_score.abs() > threshold
			# Nullable dtypes produce <NA> for missing entries; treat those as
			# "not an outlier", the same way plain NaN already compares.
			return flagged.fillna(False).astype(bool)

		# Start from an all-False mask that shares the frame's index so every
		# per-column mask combines positionally, even when there are no columns.
		is_outlier = pd.Series(False, index=data.index, dtype=bool)
		# items() yields each column as a Series, including duplicated labels.
		for _, column in data.select_dtypes(include="number").items():
			is_outlier = is_outlier | detect_discrete_outliers(column, types, threshold)

		# Boolean indexing returns a new frame, so the caller's data is untouched.
		return data[~is_outlier]