Skip to content

Instantly share code, notes, and snippets.

@ksv-muralidhar
Created February 16, 2021 03:24
Show Gist options
  • Save ksv-muralidhar/9bd104692870f740565085a354a2e118 to your computer and use it in GitHub Desktop.
Save ksv-muralidhar/9bd104692870f740565085a354a2e118 to your computer and use it in GitHub Desktop.
outlier detection
from sklearn.datasets import load_wine
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the wine dataset once and reuse the same bunch for both the values and
# the column names (the dataset was previously loaded twice).
wine = load_wine()
data = pd.DataFrame(wine["data"], columns=wine["feature_names"])
data.head()
# One box plot per feature, laid out on a 7x2 grid.
data.plot(kind="box", subplots=True, layout=(7, 2), figsize=(15, 20));
#FUNCTION TO IDENTIFY OUTLIERS USING IQR METHOD
def iqr_outlier(x, factor):
    """Flag the values of *x* that fall outside the IQR fences.

    Args:
        x: pandas Series of numeric values.
        factor: multiplier applied to the inter-quartile range when placing
            the lower and upper fences (callers in this file pass 1.5).

    Returns:
        An integer pandas Series on the same index as ``x``: 1 where the
        value is below ``q1 - factor * iqr`` or above ``q3 + factor * iqr``,
        0 otherwise.
    """
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    is_outlier = (x < lower) | (x > upper)
    # Build the flags directly on x's index. Creating a fresh 0..n-1 Series
    # and assigning through the boolean mask fails with an unalignable-index
    # error whenever x does not carry the default RangeIndex.
    return pd.Series(np.where(is_outlier, 1, 0), index=x.index)
#SCATTER PLOTS HIGHLIGHTING OUTLIERS CALCULATED USING IQR METHOD
# One panel per column, two panels per grid row. The number of grid rows is
# derived from the number of columns (ceiling division) rather than hard-coded.
n_rows = max(1, -(-len(data.columns) // 2))
fig, ax = plt.subplots(n_rows, 2, figsize=(20, 30), squeeze=False)
panels = ax.ravel()
for panel, i in zip(panels, data.columns):
    values = data[i]
    positions = np.arange(len(values))
    outliers = iqr_outlier(values, 1.5)
    # Positional boolean mask, so selection does not depend on index labels.
    is_outlier = np.asarray(outliers) == 1
    if not is_outlier.any():
        sns.scatterplot(x=positions, y=values, ax=panel, legend=False, color='green')
    else:
        sns.scatterplot(x=positions, y=values, ax=panel, hue=outliers, palette=['green', 'red'])
        # Annotate every flagged point with its value.
        for x_pos, y_val in zip(positions[is_outlier], values[is_outlier]):
            panel.text(x=x_pos, y=y_val, s=str(y_val), fontsize=8)
        panel.legend(ncol=2)
    panel.set_ylabel("")
    panel.set_title(i)
    panel.xaxis.set_visible(False)
# Hide every grid cell that did not receive a column. The previous
# ax[row, col] lookup indexed past the grid when the column count was even.
for unused in panels[len(data.columns):]:
    unused.axis('off')
plt.show()
#FUNCTION TO DETECT OUTLIERS USING Z-SCORE METHOD
def zscore_outlier(x, lb, ub):
    """Flag the values of *x* whose z-score lies outside ``[lb, ub]``.

    Args:
        x: pandas Series of numeric values.
        lb: lower z-score bound; values scoring below it are flagged.
        ub: upper z-score bound; values scoring above it are flagged.

    Returns:
        An integer pandas Series of 0/1 flags (1 = outlier). It carries
        ``x``'s index when ``x`` has one, otherwise a default index.
    """
    zscore = (x - x.mean()) / x.std()
    is_outlier = np.asarray((zscore < lb) | (zscore > ub))
    # Attach x's own index to the flags. Assigning through a boolean mask
    # into a fresh 0..n-1 Series fails for inputs with any other index.
    return pd.Series(np.where(is_outlier, 1, 0), index=getattr(x, "index", None))
#PLOTTING A SCATTER PLOT AND HIGHLIGHTING THE OUTLIERS DETECTED BY Z-SCORE METHOD
# One panel per column, two panels per grid row. The number of grid rows is
# derived from the number of columns (ceiling division) rather than hard-coded.
n_rows = max(1, -(-len(data.columns) // 2))
fig, ax = plt.subplots(n_rows, 2, figsize=(20, 30), squeeze=False)
panels = ax.ravel()
for panel, i in zip(panels, data.columns):
    values = data[i]
    positions = np.arange(len(values))
    outliers = zscore_outlier(values, -3, 3)
    # Positional boolean mask, so selection does not depend on index labels.
    is_outlier = np.asarray(outliers) == 1
    if not is_outlier.any():
        sns.scatterplot(x=positions, y=values, ax=panel, legend=False, color='green')
    else:
        sns.scatterplot(x=positions, y=values, ax=panel, hue=outliers, palette=['green', 'red'])
        # Annotate every flagged point with its value.
        for x_pos, y_val in zip(positions[is_outlier], values[is_outlier]):
            panel.text(x=x_pos, y=y_val, s=str(y_val), fontsize=8)
        panel.legend(ncol=2)
    panel.set_ylabel("")
    panel.set_title(i)
    panel.xaxis.set_visible(False)
# Hide every grid cell that did not receive a column. The previous
# ax[row, col] lookup indexed past the grid when the column count was even.
for unused in panels[len(data.columns):]:
    unused.axis('off')
plt.show()
def euclidean_distance_outlier(x, cutoff):
    """Flag rows of *x* that lie unusually far from the column-wise mean.

    Each row's Euclidean distance to the mean point is computed, the
    distances are converted to absolute z-scores, and rows whose score
    exceeds *cutoff* are flagged.

    Args:
        x: pandas DataFrame of numeric columns, one observation per row.
        cutoff: absolute z-score of the distance above which a row is
            flagged.

    Returns:
        An integer pandas Series of 0/1 flags (1 = outlier). It carries
        ``x``'s index when ``x`` has one, otherwise a default index.
    """
    data_mean = x.mean()  # mean of data
    dist = np.sqrt(np.sum((x - data_mean) ** 2, axis=1))  # Euclidean distance
    dist_zscore = np.abs((dist - dist.mean()) / dist.std())  # z-score of the distances
    is_outlier = np.asarray(dist_zscore > cutoff)
    # Attach x's own index to the flags. Assigning through a boolean mask
    # into a fresh 0..n-1 Series fails for inputs with any other index.
    return pd.Series(np.where(is_outlier, 1, 0), index=getattr(x, "index", None))
# Flag joint (malic_acid, magnesium) outliers and plot them.
# The frame is named euc_d throughout, and the detector is called by its
# defined name; the earlier `d` / `euclidean_distance` names were undefined.
euc_d = data[["malic_acid", "magnesium"]].copy()
# Compute the flags before adding the column, so only the two feature
# columns enter the distance calculation.
euc_d['outlier'] = euclidean_distance_outlier(euc_d, 3)
sns.scatterplot(x="malic_acid", y="magnesium", data=euc_d, hue="outlier", palette=["green", "red"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment