ksv-muralidhar/outlier_detect_1.py

## outlier_detect_1.py
from sklearn.datasets import load_wine
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the wine dataset once and reuse the bunch for both the values and the
# column names (the original called load_wine() twice for a single frame).
wine = load_wine()
data = pd.DataFrame(wine["data"], columns=wine["feature_names"])
# Bare expression: only displays the first rows when run in a notebook cell.
data.head()

## outlier_detect_2.py
# Draw one box plot per feature, laid out on a 7x2 grid of subplots.
# The trailing semicolon only suppresses the returned value in a notebook cell.
data.plot(
    kind="box",
    subplots=True,
    layout=(7, 2),
    figsize=(15, 20),
);

## outlier_detect_3.py
#FUNCTION TO IDENTIFY OUTLIERS USING IQR METHOD
def iqr_outlier(x, factor):
    """Flag the values of a pandas Series that fall outside the IQR fences.

    A value is flagged when it is below ``q1 - factor * iqr`` or above
    ``q3 + factor * iqr``, where ``q1`` and ``q3`` are the 25th and 75th
    percentiles of ``x`` and ``iqr = q3 - q1``.

    Args:
        x: pandas Series of numeric values.
        factor: multiplier applied to the IQR to build the fences (e.g. 1.5).

    Returns:
        Integer pandas Series on the same index as ``x``: 1 marks an
        outlier, 0 a regular value.
    """
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    iqr = q3 - q1
    min_ = q1 - factor * iqr
    max_ = q3 + factor * iqr
    # Build the flags on x's own index. With a default RangeIndex the boolean
    # mask below cannot be aligned whenever x is not labelled 0..n-1.
    result_ = pd.Series([0] * len(x), index=x.index)
    result_[(x < min_) | (x > max_)] = 1
    return result_
# One scatter plot per feature on a 7x2 grid, with the points flagged by the
# IQR method drawn in red and labelled with their value.
fig, ax = plt.subplots(7, 2, figsize=(20, 30))
row = col = 0
for n, i in enumerate(data.columns):
    # Every second feature starts a new row of the grid.
    if n > 0 and n % 2 == 0:
        row, col = row + 1, 0
    panel = ax[row, col]
    positions = np.arange(len(data[i]))  # x-axis is just the row position
    outliers = iqr_outlier(data[i], 1.5)
    has_outliers = sum(outliers) > 0

    if has_outliers:
        sns.scatterplot(x=positions, y=data[i], ax=panel, hue=outliers, palette=['green', 'red'])
    else:
        sns.scatterplot(x=positions, y=data[i], ax=panel, legend=False, color='green')
    # Annotate each flagged point with its own value.
    for x, y in zip(positions[outliers == 1], data[i][outliers == 1]):
        panel.text(x=x, y=y, s=y, fontsize=8)
    panel.set_ylabel("")
    panel.set_title(i)
    panel.xaxis.set_visible(False)
    if has_outliers:
        panel.legend(ncol=2)
    col += 1
# Hide the grid cell right after the last plotted feature.
# NOTE(review): assumes an odd number of columns so that col == 1 here.
ax[row, col].axis('off')
plt.show()

## outlier_detect_4.py
#FUNCTION TO DETECT OUTLIERS USING Z-SCORE METHOD
def zscore_outlier(x, lb, ub):
    """Flag the values of a pandas Series whose z-score leaves ``[lb, ub]``.

    The z-score is ``(x - x.mean()) / x.std()``, using the pandas default
    for ``std``.

    Args:
        x: pandas Series of numeric values.
        lb: lower z-score bound; values with a z-score below it are flagged.
        ub: upper z-score bound; values with a z-score above it are flagged.

    Returns:
        Integer pandas Series on the same index as ``x``: 1 marks an
        outlier, 0 a regular value.
    """
    zscore = (x - x.mean()) / x.std()
    # Build the flags on x's own index. With a default RangeIndex the boolean
    # mask below cannot be aligned whenever x is not labelled 0..n-1.
    result_ = pd.Series([0] * len(x), index=x.index)
    result_[(zscore < lb) | (zscore > ub)] = 1
    return result_
# One scatter plot per feature on a 7x2 grid, with the points flagged by the
# z-score method (outside [-3, 3]) drawn in red and labelled with their value.
fig, ax = plt.subplots(7, 2, figsize=(20, 30))
row = col = 0
for n, i in enumerate(data.columns):
    # Every second feature starts a new row of the grid.
    if n > 0 and n % 2 == 0:
        row, col = row + 1, 0
    panel = ax[row, col]
    positions = np.arange(len(data[i]))  # x-axis is just the row position
    outliers = zscore_outlier(data[i], -3, 3)
    has_outliers = sum(outliers) > 0

    if has_outliers:
        sns.scatterplot(x=positions, y=data[i], ax=panel, hue=outliers, palette=['green', 'red'])
    else:
        sns.scatterplot(x=positions, y=data[i], ax=panel, legend=False, color='green')
    # Annotate each flagged point with its own value.
    for x, y in zip(positions[outliers == 1], data[i][outliers == 1]):
        panel.text(x=x, y=y, s=y, fontsize=8)
    panel.set_ylabel("")
    panel.set_title(i)
    panel.xaxis.set_visible(False)
    if has_outliers:
        panel.legend(ncol=2)
    col += 1
# Hide the grid cell right after the last plotted feature.
# NOTE(review): assumes an odd number of columns so that col == 1 here.
ax[row, col].axis('off')
plt.show()

## outlier_detect_5.py
def euclidean_distance_outlier(x, cutoff):
    """Flag the rows of a DataFrame that lie unusually far from its mean point.

    Each row's Euclidean distance to the column-wise mean of ``x`` is
    computed, the distances are converted to absolute z-scores, and rows
    whose z-score exceeds ``cutoff`` are flagged.

    Args:
        x: pandas DataFrame of numeric columns (one row per observation).
        cutoff: absolute z-score of the distance above which a row is
            flagged.

    Returns:
        Integer pandas Series on the same index as ``x``: 1 marks an
        outlier row, 0 a regular row.
    """
    data_mean = x.mean()  # column-wise mean, i.e. the centre of the data
    dist = np.sqrt(((x - data_mean) ** 2).sum(axis=1))  # Euclidean distance per row
    dist_mean = dist.mean()  # mean of the distances
    dist_zscore = np.abs((dist - dist_mean) / dist.std())  # z-score of the distances
    # Build the flags on x's own index. With a default RangeIndex the boolean
    # mask below cannot be aligned whenever x is not labelled 0..n-1.
    result_ = pd.Series([0] * len(x), index=x.index)
    result_[dist_zscore > cutoff] = 1
    return result_
# Flag outliers in the (malic_acid, magnesium) plane and draw them in red.
# The original referenced an undefined frame `d` and an undefined function
# `euclidean_distance`; use the frame and function that actually exist.
euc_d = data[["malic_acid", "magnesium"]].copy()
# The flags are computed before the column is assigned, so 'outlier' itself
# never takes part in the distance computation.
euc_d['outlier'] = euclidean_distance_outlier(euc_d, 3)
sns.scatterplot(x="malic_acid", y="magnesium", data=euc_d, hue="outlier", palette=["green", "red"])
	from sklearn.datasets import load_wine
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	# Load the wine dataset once and reuse the bunch for both the values and the
	# column names (the original called load_wine() twice for a single frame).
	wine = load_wine()
	data = pd.DataFrame(wine["data"], columns=wine["feature_names"])
	# Bare expression: only displays the first rows when run in a notebook cell.
	data.head()
	#FUNCTION TO IDENTIFY OUTLIERS USING IQR METHOD
	def iqr_outlier(x, factor):
	    """Flag the values of a pandas Series that fall outside the IQR fences.

	    A value is flagged when it is below ``q1 - factor * iqr`` or above
	    ``q3 + factor * iqr``, where ``q1`` and ``q3`` are the 25th and 75th
	    percentiles of ``x`` and ``iqr = q3 - q1``.

	    Args:
	        x: pandas Series of numeric values.
	        factor: multiplier applied to the IQR to build the fences (e.g. 1.5).

	    Returns:
	        Integer pandas Series on the same index as ``x``: 1 marks an
	        outlier, 0 a regular value.
	    """
	    q1 = x.quantile(0.25)
	    q3 = x.quantile(0.75)
	    iqr = q3 - q1
	    min_ = q1 - factor * iqr
	    max_ = q3 + factor * iqr
	    # Build the flags on x's own index. With a default RangeIndex the boolean
	    # mask below cannot be aligned whenever x is not labelled 0..n-1.
	    result_ = pd.Series([0] * len(x), index=x.index)
	    result_[(x < min_) | (x > max_)] = 1
	    return result_
	# One scatter plot per feature on a 7x2 grid, with the points flagged by the
	# IQR method drawn in red and labelled with their value.
	# (Nesting restored: the loop and branch bodies had lost their indentation.)
	fig, ax = plt.subplots(7, 2, figsize=(20, 30))
	row = col = 0
	for n, i in enumerate(data.columns):
	    # Every second feature starts a new row of the grid.
	    if n > 0 and n % 2 == 0:
	        row, col = row + 1, 0
	    panel = ax[row, col]
	    positions = np.arange(len(data[i]))  # x-axis is just the row position
	    outliers = iqr_outlier(data[i], 1.5)
	    has_outliers = sum(outliers) > 0

	    if has_outliers:
	        sns.scatterplot(x=positions, y=data[i], ax=panel, hue=outliers, palette=['green', 'red'])
	    else:
	        sns.scatterplot(x=positions, y=data[i], ax=panel, legend=False, color='green')
	    # Annotate each flagged point with its own value.
	    for x, y in zip(positions[outliers == 1], data[i][outliers == 1]):
	        panel.text(x=x, y=y, s=y, fontsize=8)
	    panel.set_ylabel("")
	    panel.set_title(i)
	    panel.xaxis.set_visible(False)
	    if has_outliers:
	        panel.legend(ncol=2)
	    col += 1
	# Hide the grid cell right after the last plotted feature.
	# NOTE(review): assumes an odd number of columns so that col == 1 here.
	ax[row, col].axis('off')
	plt.show()
	#FUNCTION TO DETECT OUTLIERS USING Z-SCORE METHOD
	def zscore_outlier(x, lb, ub):
	    """Flag the values of a pandas Series whose z-score leaves ``[lb, ub]``.

	    The z-score is ``(x - x.mean()) / x.std()``, using the pandas default
	    for ``std``.

	    Args:
	        x: pandas Series of numeric values.
	        lb: lower z-score bound; values with a z-score below it are flagged.
	        ub: upper z-score bound; values with a z-score above it are flagged.

	    Returns:
	        Integer pandas Series on the same index as ``x``: 1 marks an
	        outlier, 0 a regular value.
	    """
	    zscore = (x - x.mean()) / x.std()
	    # Build the flags on x's own index. With a default RangeIndex the boolean
	    # mask below cannot be aligned whenever x is not labelled 0..n-1.
	    result_ = pd.Series([0] * len(x), index=x.index)
	    result_[(zscore < lb) | (zscore > ub)] = 1
	    return result_
	# One scatter plot per feature on a 7x2 grid, with the points flagged by the
	# z-score method (outside [-3, 3]) drawn in red and labelled with their value.
	# (Nesting restored: the loop and branch bodies had lost their indentation.)
	fig, ax = plt.subplots(7, 2, figsize=(20, 30))
	row = col = 0
	for n, i in enumerate(data.columns):
	    # Every second feature starts a new row of the grid.
	    if n > 0 and n % 2 == 0:
	        row, col = row + 1, 0
	    panel = ax[row, col]
	    positions = np.arange(len(data[i]))  # x-axis is just the row position
	    outliers = zscore_outlier(data[i], -3, 3)
	    has_outliers = sum(outliers) > 0

	    if has_outliers:
	        sns.scatterplot(x=positions, y=data[i], ax=panel, hue=outliers, palette=['green', 'red'])
	    else:
	        sns.scatterplot(x=positions, y=data[i], ax=panel, legend=False, color='green')
	    # Annotate each flagged point with its own value.
	    for x, y in zip(positions[outliers == 1], data[i][outliers == 1]):
	        panel.text(x=x, y=y, s=y, fontsize=8)
	    panel.set_ylabel("")
	    panel.set_title(i)
	    panel.xaxis.set_visible(False)
	    if has_outliers:
	        panel.legend(ncol=2)
	    col += 1
	# Hide the grid cell right after the last plotted feature.
	# NOTE(review): assumes an odd number of columns so that col == 1 here.
	ax[row, col].axis('off')
	plt.show()
	def euclidean_distance_outlier(x, cutoff):
	    """Flag the rows of a DataFrame that lie unusually far from its mean point.

	    Each row's Euclidean distance to the column-wise mean of ``x`` is
	    computed, the distances are converted to absolute z-scores, and rows
	    whose z-score exceeds ``cutoff`` are flagged.

	    Args:
	        x: pandas DataFrame of numeric columns (one row per observation).
	        cutoff: absolute z-score of the distance above which a row is
	            flagged.

	    Returns:
	        Integer pandas Series on the same index as ``x``: 1 marks an
	        outlier row, 0 a regular row.
	    """
	    data_mean = x.mean()  # column-wise mean, i.e. the centre of the data
	    dist = np.sqrt(((x - data_mean) ** 2).sum(axis=1))  # Euclidean distance per row
	    dist_mean = dist.mean()  # mean of the distances
	    dist_zscore = np.abs((dist - dist_mean) / dist.std())  # z-score of the distances
	    # Build the flags on x's own index. With a default RangeIndex the boolean
	    # mask below cannot be aligned whenever x is not labelled 0..n-1.
	    result_ = pd.Series([0] * len(x), index=x.index)
	    result_[dist_zscore > cutoff] = 1
	    return result_
	# Flag outliers in the (malic_acid, magnesium) plane and draw them in red.
	# The original referenced an undefined frame `d` and an undefined function
	# `euclidean_distance`; use the frame and function that actually exist.
	euc_d = data[["malic_acid", "magnesium"]].copy()
	# The flags are computed before the column is assigned, so 'outlier' itself
	# never takes part in the distance computation.
	euc_d['outlier'] = euclidean_distance_outlier(euc_d, 3)
	sns.scatterplot(x="malic_acid", y="magnesium", data=euc_d, hue="outlier", palette=["green", "red"])