Skip to content

Instantly share code, notes, and snippets.

@tanveer-sayyed
Created April 1, 2019 09:50
Show Gist options
  • Save tanveer-sayyed/45cfda9c4fe06243d70a6a5b66b55b7e to your computer and use it in GitHub Desktop.
Save tanveer-sayyed/45cfda9c4fe06243d70a6a5b66b55b7e to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: tanveer
"""
import pandas as pd
import numpy as np
import random
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import classification_report, roc_curve
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')
# Seed both random number generators so that repeated runs are reproducible.
# The seed functions must be *called*: assigning to them (``random.seed = 0``)
# seeds nothing and replaces the function with an integer for every other
# user of the module.
np.random.seed(0)
random.seed(0)
# Per-column value counts of the frame last passed to CountAll().
valueCounts = {}
def CountAll(df):
    """Snapshot per-column statistics of ``df`` into module-level globals.

    After the call the following module globals describe ``df``:

    * ``all_columns`` -- list of the column labels.
    * ``nanCounts``   -- Series with the number of missing values per column.
    * ``nanPercent``  -- Series with the percentage of missing values per column.
    * ``valueCounts`` -- dict mapping each column label to the result of
      ``value_counts()`` on that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise; it is not modified.
    """
    global all_columns, nanCounts, valueCounts, nanPercent
    all_columns = list(df)
    nanCounts = df.isnull().sum()
    nanPercent = nanCounts / len(df) * 100
    # Rebuild the mapping on every call so that columns belonging to a frame
    # seen earlier cannot linger next to the statistics of the current one.
    valueCounts = {column: df[column].value_counts() for column in all_columns}
"""-------------------------------------------------------------------------"""
"""Random but proportional replacement(RBPR) of categoricals."""
def Fill_NaNs_Catigorical(col, df):
    """Fill the NaNs of a categorical column in place (RBPR).

    Every observed category receives a share of the missing rows that is
    proportional to its observed frequency; which of the missing rows end up
    with which category is decided at random (``random`` module).

    Parameters
    ----------
    col : hashable
        Label of the column to impute.
    df : pandas.DataFrame
        Frame to modify in place.  Rows are addressed by index label, so the
        index labels are expected to be unique.

    Raises
    ------
    ValueError
        If the column has missing values but not a single observed value.
    """
    # The statistics are taken from ``df`` itself instead of the module-level
    # ``valueCounts`` / ``nanCounts``, so the function neither requires a prior
    # CountAll() call nor can it act on counts of a different frame.
    observed = df[col].value_counts()
    missing_labels = df.index[df[col].isnull().to_numpy()].tolist()
    n_missing = len(missing_labels)
    if n_missing == 0:
        return
    if observed.empty:
        raise ValueError(
            "column %r has no observed values to draw replacements from" % (col,))

    # Calculating proportion.
    counts = observed.to_numpy()
    quota = np.around(counts / counts.sum() * n_missing).astype('int')

    # Adjusting proportion: rounding can leave the quotas a few rows short of,
    # or above, the number of rows that actually need a value.
    diff = n_missing - int(quota.sum())
    if diff > 0:
        # Hand out the surplus weighted by observed frequency so that it stays
        # proportional instead of favouring rare categories.
        for idx in random.choices(range(len(quota)), weights=counts.tolist(), k=diff):
            quota[idx] += 1
    while diff < 0:
        # Take one row back from a randomly chosen category that still has some.
        idx = random.choice([i for i, q in enumerate(quota) if q > 0])
        quota[idx] -= 1
        diff += 1

    # Filling NaNs: shuffle once, then give each category the next slice.
    random.shuffle(missing_labels)
    start = 0
    for category, share in zip(observed.index, quota):
        share = int(share)
        if share > 0:
            df.loc[missing_labels[start:start + share], col] = category
            start += share
"""-------------------------------------------------------------------------"""
"""Random but proportional replacement(RBPR) of numeric"""
def Fill_NaNs_Numeric(col, df, bin_size=45):
    """Fill the NaNs of a numeric column in place (RBPR).

    The observed values are spread over ``bin_size`` bins whose upper edges
    are ``np.linspace(min, max, bin_size)`` (the first bin therefore only
    holds values equal to the minimum).  Each non-empty bin receives a share
    of the missing rows proportional to its population, and those rows are
    filled with the mean of that bin.  Which missing rows go to which bin is
    decided at random (``random`` module).

    Parameters
    ----------
    col : hashable
        Label of the column to impute.
    df : pandas.DataFrame
        Frame to modify in place.  Rows are addressed by index label, so the
        index labels are expected to be unique.
    bin_size : int, optional
        Number of bins; defaults to 45, the value that used to be hard-coded.

    Raises
    ------
    ValueError
        If ``bin_size`` is smaller than 1, or if the column has missing values
        but not a single observed value.
    """
    if bin_size < 1:
        raise ValueError("bin_size must be at least 1, got %r" % (bin_size,))
    missing_mask = df[col].isnull().to_numpy()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        return
    # Selecting ONLY non-NaNs.  Taken from ``df`` itself rather than from the
    # module-level ``valueCounts`` / ``nanCounts`` so that no prior CountAll()
    # call is needed and stale counts cannot be used.
    observed = df[col].dropna().to_numpy(dtype=float)
    if observed.size == 0:
        raise ValueError(
            "column %r has no observed values to draw replacements from" % (col,))

    # Filling the bins (with non-NaNs): value v belongs to bin i when
    # edges[i-1] < v <= edges[i]; everything <= edges[0] goes to bin 0.
    edges = np.linspace(observed.min(), observed.max(), bin_size)
    bin_of = np.searchsorted(edges, observed, side='left')
    # Only relevant for bin_size == 1, where every value shares the one bin.
    bin_of = np.minimum(bin_of, bin_size - 1)
    counts = np.bincount(bin_of, minlength=bin_size)
    sums = np.bincount(bin_of, weights=observed, minlength=bin_size)

    # Calculating proportion.
    quota = np.around(counts / counts.sum() * n_missing).astype('int')

    # Adjusting proportion: rounding can leave the quotas a few rows short of,
    # or above, the number of rows that actually need a value.
    diff = n_missing - int(quota.sum())
    if diff > 0:
        # Weighting by population keeps the surplus proportional and, because
        # empty bins have weight 0, never assigns rows to a bin with no mean.
        for idx in random.choices(range(bin_size), weights=counts.tolist(), k=diff):
            quota[idx] += 1
    while diff < 0:
        # Take one row back from a randomly chosen bin that still has some.
        idx = random.choice([i for i, q in enumerate(quota) if q > 0])
        quota[idx] -= 1
        diff += 1

    # Filling NaNs: shuffle once, then give each bin the next slice of rows.
    missing_labels = df.index[missing_mask].tolist()
    random.shuffle(missing_labels)
    start = 0
    for b in np.flatnonzero(quota):
        stop = start + int(quota[b])
        # Replacing with the bin mean; quota > 0 implies the bin is non-empty.
        df.loc[missing_labels[start:stop], col] = sums[b] / counts[b]
        start = stop
"""-------------------------------------------------------------------------"""
# One list per (imputation, species, metric); every attempt of the experiment
# appends a single value to each of them.
# "hard" = mean/mode imputation, "soft" = RBPR imputation.

# Recall per class.
hard_setosa_recall, hard_virginica_recall, hard_versicolor_recall = [], [], []
soft_setosa_recall, soft_virginica_recall, soft_versicolor_recall = [], [], []

# Precision per class.
hard_setosa_precision, hard_virginica_precision, hard_versicolor_precision = [], [], []
soft_setosa_precision, soft_virginica_precision, soft_versicolor_precision = [], [], []

# F1 score per class.
hard_setosa_f1, hard_virginica_f1, hard_versicolor_f1 = [], [], []
soft_setosa_f1, soft_virginica_f1, soft_versicolor_f1 = [], [], []

# Overall accuracy.
hard_score, soft_score = [], []

# Column names of the iris frame, grouped by how they are imputed.
numerical = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
    'petal width (cm)',
]
categorical = ['target']

# Number of times the experiment is repeated.
no_of_attempts = 500
# Experiment: on every attempt blank out 30 random cells in each column of the
# iris data, impute the gaps once with mean/mode ("hard") and once with RBPR
# ("soft"), then log how a depth-3 decision tree scores on either version.
iris = load_iris()  # loaded once; every attempt works on its own copy

# Where the results of each imputation variant are appended:
# variant -> (accuracy log, {species -> (recall log, precision log, f1 log)}).
_metric_logs = {
    'hard': (hard_score, {
        'setosa': (hard_setosa_recall, hard_setosa_precision, hard_setosa_f1),
        'virginica': (hard_virginica_recall, hard_virginica_precision, hard_virginica_f1),
        'versicolor': (hard_versicolor_recall, hard_versicolor_precision, hard_versicolor_f1),
    }),
    'soft': (soft_score, {
        'setosa': (soft_setosa_recall, soft_setosa_precision, soft_setosa_f1),
        'virginica': (soft_virginica_recall, soft_virginica_precision, soft_virginica_f1),
        'versicolor': (soft_versicolor_recall, soft_versicolor_precision, soft_versicolor_f1),
    }),
}

for _ in range(no_of_attempts):
    # copy=True: the NaNs written below must not leak into ``iris.data``,
    # which is shared by all attempts.
    df_original = pd.DataFrame(iris.data, columns=iris.feature_names, copy=True)
    # Species names instead of the codes 0/1/2, stored as plain objects.
    df_original['target'] = iris.target_names[iris.target].astype('object')
    for col in df_original.columns:
        idx = random.sample(range(len(df_original)), 30)  # 30 unique rows
        df_original.loc[idx, col] = np.nan  # ``np.NaN`` no longer exists in NumPy 2
    df_h = df_original.copy()  # hard
    df_s = df_original.copy()  # soft

    # Hard imputation: column mean for numeric columns, mode for categoricals.
    # The result is assigned back; ``fillna(..., inplace=True)`` on ``df[col]``
    # does not reliably update the frame itself.
    for col in numerical:
        df_h[col] = df_h[col].fillna(df_h[col].mean())
    for col in categorical:
        df_h[col] = df_h[col].fillna(df_h[col].mode()[0])

    # Soft imputation: random but proportional replacement.
    CountAll(df_s)
    for col in numerical:
        Fill_NaNs_Numeric(col, df_s)
    for col in categorical:
        Fill_NaNs_Catigorical(col, df_s)

    # Same evaluation for both variants, hard first.
    for variant, df_imputed in (('hard', df_h), ('soft', df_s)):
        score_log, class_logs = _metric_logs[variant]
        X = df_imputed.drop(columns='target')
        y = df_imputed['target']
        X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3)
        dtC = DecisionTreeClassifier(max_depth=3, criterion='entropy')
        dtC.fit(X_train, y_train)
        y_pred = dtC.predict(X_test)
        score_log.append(dtC.score(X_test, y_test))
        classif_report = classification_report(y_test, y_pred, output_dict=True)
        for species, (recall_log, precision_log, f1_log) in class_logs.items():
            recall_log.append(classif_report[species]['recall'])
            precision_log.append(classif_report[species]['precision'])
            f1_log.append(classif_report[species]['f1-score'])
import matplotlib.pyplot as plt


def _plot_metric_panel(values, name, avg_label, color):
    """Draw one metric series on the current axes.

    The series is shown as dots joined by a dashed line; horizontal lines mark
    its mean (orangered), standard deviation (green) and variance (gold).

    ``name`` is used as y-label and, upper-cased, as title; ``avg_label`` is
    the text that follows "avg " in the legend entry of the mean line.
    """
    values = np.asarray(values)
    # ``color=`` rather than ``c=``: a single RGBA tuple passed as ``c`` can be
    # mistaken for a sequence of scalar values to be colour-mapped.
    plt.scatter(np.arange(len(values)), values, color=color)
    plt.plot(values, color=color, linestyle='dashed')
    avg = values.mean()
    std = values.std()
    var = values.var()
    plt.axhline(avg, c='orangered', label='avg %s: %0.2f' % (avg_label, avg))
    plt.axhline(std, c='green', label='std. dev.: %0.3f' % (std))
    plt.axhline(var, c='gold', label='variance: %0.3f' % (var))
    plt.axis('tight')
    plt.xlabel('Across different %d sets' % len(values))
    plt.ylabel(name)
    plt.title(name.upper())
    plt.legend(loc='lower right')


def _plot_hard_vs_soft(hard_panels, soft_panels, figsize):
    """Show one figure with the hard panels on the top row, soft ones below.

    Each panel is a ``(name, values, avg_label)`` tuple; both rows are expected
    to have the same number of panels.
    """
    n_cols = len(hard_panels)
    plt.figure(figsize=figsize)
    for row, panels in enumerate((hard_panels, soft_panels)):
        for i, (name, values, avg_label) in enumerate(panels):
            plt.subplot(2, n_cols, row * n_cols + i + 1)
            _plot_metric_panel(values, name, avg_label, plt.cm.Paired(i / 10.0))
    plt.tight_layout()
    plt.show()


# One figure per class-wise metric: a column per species, hard above soft.
_species = ('setosa', 'virginica', 'versicolor')
for _metric, _hard_logs, _soft_logs in (
    ('recall',
     (hard_setosa_recall, hard_virginica_recall, hard_versicolor_recall),
     (soft_setosa_recall, soft_virginica_recall, soft_versicolor_recall)),
    ('precision',
     (hard_setosa_precision, hard_virginica_precision, hard_versicolor_precision),
     (soft_setosa_precision, soft_virginica_precision, soft_versicolor_precision)),
    ('f1',
     (hard_setosa_f1, hard_virginica_f1, hard_versicolor_f1),
     (soft_setosa_f1, soft_virginica_f1, soft_versicolor_f1)),
):
    _plot_hard_vs_soft(
        [('hard_%s_%s' % (sp, _metric), log, _metric)
         for sp, log in zip(_species, _hard_logs)],
        [('soft_%s_%s' % (sp, _metric), log, _metric)
         for sp, log in zip(_species, _soft_logs)],
        figsize=(25, 10),
    )

# Overall accuracy: a single column, hard above soft.
_plot_hard_vs_soft(
    [('hard_score', hard_score, 'hard score')],
    [('soft_score', soft_score, 'soft score')],
    figsize=(9, 10),
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment