Created
April 1, 2019 09:50
-
-
Save tanveer-sayyed/45cfda9c4fe06243d70a6a5b66b55b7e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
""" | |
@author: tanveer | |
""" | |
import pandas as pd | |
import numpy as np | |
import random | |
from sklearn.datasets import load_iris | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.model_selection import train_test_split as tts | |
from sklearn.metrics import classification_report, roc_curve | |
from sklearn.ensemble import RandomForestClassifier | |
import warnings | |
warnings.filterwarnings('ignore') | |
# Seed both RNG sources for reproducibility.
# BUG FIX: the original wrote ``np.random.seed = 0`` and ``random.seed = 0``,
# which REPLACES the seed functions with the integer 0 instead of calling
# them — neither generator was ever actually seeded.
np.random.seed(0)
random.seed(0)

# Cache of per-column value_counts, populated by CountAll().
valueCounts = {}
def CountAll(df):
    """Scan *df* once and cache its column statistics in module globals.

    Populates ``all_columns`` (column names), ``nanCounts`` (NaNs per
    column), ``nanPercent`` (NaNs as a percentage of rows) and
    ``valueCounts`` (per-column ``value_counts`` Series), which the
    Fill_NaNs_* helpers read.
    """
    global all_columns, nanCounts, valueCounts, nanPercent
    all_columns = df.columns.tolist()
    nanCounts = df.isnull().sum()
    nanPercent = nanCounts / len(df) * 100
    for column_name in all_columns:
        valueCounts[column_name] = df[column_name].value_counts()
"""-------------------------------------------------------------------------""" | |
"""Random but proportional replacement(RBPR) of categoricals.""" | |
def Fill_NaNs_Catigorical(col, df): | |
"""Calculating proportion.""" | |
proportion = np.array(valueCounts[col].values) / valueCounts[col].sum() * nanCounts[col] | |
proportion = np.around(proportion).astype('int') | |
"""Adjusting proportion.""" | |
diff = int(nanCounts[col] - np.sum(proportion)) | |
if diff > 0: | |
for x in range(diff): | |
idx = random.randint(0, len(proportion) - 1) | |
proportion[idx] = proportion[idx] + 1 | |
else: | |
diff = -diff | |
while(diff != 0): | |
idx = random.randint(0, len(proportion) - 1) | |
if proportion[idx] > 0: | |
proportion[idx] = proportion[idx] - 1 | |
diff = diff - 1 | |
"""Filling NaNs.""" | |
nan_indexes = df[df[col].isnull()].index.tolist() | |
for x in range(len(proportion)): | |
if proportion[x] > 0: | |
random_subset = random.sample(population = nan_indexes, k = proportion[x]) | |
df.loc[random_subset, col] = valueCounts[col].keys()[x] | |
nan_indexes = list(set(nan_indexes) - set(random_subset)) | |
"""-------------------------------------------------------------------------""" | |
"""Random but proportional replacement(RBPR) of numeric""" | |
def Fill_NaNs_Numeric(col, df): | |
mini = df[col].min() | |
maxi = df[col].max() | |
"""Selecting ONLY non-NaNs.""" | |
temp = df[df[col].notnull()][col] # type --> pd.Series | |
"""Any continuous data is 'always' divided into 45 bins (Hard-Coded).""" | |
bin_size = 45 | |
bins = np.linspace(mini, maxi, bin_size) | |
"""Filling the bins (with non-NaNs) and calculating mean of each bin.""" | |
non_NaNs_per_bin = [] | |
mean_of_bins = [] | |
non_NaNs_per_bin.append(len(temp[(temp <= bins[0])])) | |
mean_of_bins.append(temp[(temp <= bins[0])].mean()) | |
for x in range(1, bin_size): | |
non_NaNs_per_bin.append(len(temp[(temp <= bins[x]) & (temp > bins[x-1])])) | |
mean_of_bins.append(temp[(temp <= bins[x]) & (temp > bins[x-1])].mean()) | |
mean_of_bins = pd.Series(mean_of_bins) | |
# np.around() on list 'proportion' may create trouble and we may get a zero-value imputed, hence, | |
mean_of_bins.fillna(temp.mean(), inplace= True) | |
non_NaNs_per_bin = np.array(non_NaNs_per_bin) | |
"""Followoing part is SAME as Fill_NaNs_Catigorical()""" | |
"""Calculating proportion.""" | |
proportion = np.array(non_NaNs_per_bin) / valueCounts[col].sum() * nanCounts[col] | |
proportion = np.around(proportion).astype('int') | |
"""Adjusting proportion.""" | |
diff = int(nanCounts[col] - np.sum(proportion)) | |
if diff > 0: | |
for x in range(diff): | |
idx = random.randint(0, len(proportion) - 1) | |
proportion[idx] = proportion[idx] + 1 | |
else: | |
diff = -diff | |
while(diff != 0): | |
idx = random.randint(0, len(proportion) - 1) | |
if proportion[idx] > 0: | |
proportion[idx] = proportion[idx] - 1 | |
diff = diff - 1 | |
"""Filling NaNs.""" | |
nan_indexes = df[df[col].isnull()].index.tolist() | |
for x in range(len(proportion)): | |
if proportion[x] > 0: | |
random_subset = random.sample(population= nan_indexes, k= proportion[x]) | |
df.loc[random_subset, col] = mean_of_bins[x] # <--- Replacing with bin mean | |
nan_indexes = list(set(nan_indexes) - set(random_subset)) | |
"""-------------------------------------------------------------------------""" | |
# Recalls | |
hard_setosa_recall = list() | |
hard_virginica_recall = list() | |
hard_versicolor_recall = list() | |
soft_setosa_recall = list() | |
soft_virginica_recall = list() | |
soft_versicolor_recall = list() | |
# Precision | |
hard_setosa_precision = list() | |
hard_virginica_precision = list() | |
hard_versicolor_precision = list() | |
soft_setosa_precision = list() | |
soft_virginica_precision = list() | |
soft_versicolor_precision = list() | |
# F1 | |
hard_setosa_f1 = list() | |
hard_virginica_f1 = list() | |
hard_versicolor_f1 = list() | |
soft_setosa_f1 = list() | |
soft_virginica_f1 = list() | |
soft_versicolor_f1 = list() | |
# Accuracy Score | |
hard_score = list() | |
soft_score = list() | |
# Column names | |
numerical = ['sepal length (cm)','sepal width (cm)','petal length (cm)','petal width (cm)'] | |
categorical = ['target'] | |
no_of_attempts = 500 | |
def _train_and_report(df):
    """Fit a depth-3 entropy decision tree on a fresh 70/30 split of *df*
    and return ``(accuracy, classification report dict)``.

    Extracted because the original repeated this train/evaluate code
    verbatim for both the hard- and soft-imputed frames.  The RNG call
    order (split, fit, predict, score) matches the original exactly.
    """
    X = df.drop(columns='target')
    y = df['target']
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3)
    dtC = DecisionTreeClassifier(max_depth=3, criterion='entropy')
    dtC.fit(X_train, y_train)
    y_pred = dtC.predict(X_test)
    return dtC.score(X_test, y_test), classification_report(y_test, y_pred, output_dict=True)

# Repeat the hard-vs-soft imputation experiment no_of_attempts (500) times.
while(no_of_attempts != 0):
    # Rebuild iris with string class labels.  (The original also assigned
    # empty pd.DataFrame()s here that were immediately overwritten — dead
    # code, removed.)
    iris = load_iris()
    df_original = pd.DataFrame(iris.data, columns=iris.feature_names)
    df_original['target'] = iris.target
    df_original['target'].replace(to_replace=[0, 1, 2], value=iris.target_names, inplace=True)
    df_original['target'] = df_original['target'].astype('object')
    # Punch 30 random NaN holes into every column; the sampled positions
    # coincide with the default RangeIndex, so .loc acts positionally here.
    for col in df_original.columns:
        idx = random.sample(range(len(df_original)), 30)  # 30 unique values
        df_original.loc[idx, col] = np.NaN
    df_h = df_original.copy()  # hard-imputation copy
    df_s = df_original.copy()  # soft-imputation copy
    # --- Hard imputation: numeric -> column mean, categorical -> mode ---
    for col in numerical:
        df_h[col].fillna(df_h[col].mean(), inplace=True)
    for col in categorical:
        df_h[col].fillna(df_h[col].mode()[0], inplace=True)
    score, classif_report = _train_and_report(df_h)
    hard_score.append(score)
    hard_setosa_recall.append(classif_report['setosa']['recall'])
    hard_virginica_recall.append(classif_report['virginica']['recall'])
    hard_versicolor_recall.append(classif_report['versicolor']['recall'])
    hard_setosa_precision.append(classif_report['setosa']['precision'])
    hard_virginica_precision.append(classif_report['virginica']['precision'])
    hard_versicolor_precision.append(classif_report['versicolor']['precision'])
    hard_setosa_f1.append(classif_report['setosa']['f1-score'])
    hard_virginica_f1.append(classif_report['virginica']['f1-score'])
    hard_versicolor_f1.append(classif_report['versicolor']['f1-score'])
    # --- Soft imputation: random-but-proportional replacement (RBPR) ---
    CountAll(df_s)
    for col in numerical:
        Fill_NaNs_Numeric(col, df_s)
    for col in categorical:
        Fill_NaNs_Catigorical(col, df_s)
    score, classif_report = _train_and_report(df_s)
    soft_score.append(score)
    soft_setosa_recall.append(classif_report['setosa']['recall'])
    soft_virginica_recall.append(classif_report['virginica']['recall'])
    soft_versicolor_recall.append(classif_report['versicolor']['recall'])
    soft_setosa_precision.append(classif_report['setosa']['precision'])
    soft_virginica_precision.append(classif_report['virginica']['precision'])
    soft_versicolor_precision.append(classif_report['versicolor']['precision'])
    soft_setosa_f1.append(classif_report['setosa']['f1-score'])
    soft_virginica_f1.append(classif_report['virginica']['f1-score'])
    soft_versicolor_f1.append(classif_report['versicolor']['f1-score'])
    no_of_attempts = no_of_attempts - 1
import matplotlib.pyplot as plt

def _metric_panel(position, values, name, avg_label, grid_rows=2, grid_cols=3, color_idx=0):
    """Draw one scatter+dashed-line panel of *values* at subplot *position*
    of the current figure, with mean / std-dev / variance reference lines.

    Extracted from 8 near-identical copy-pasted panel blocks.  BUG FIX:
    one variance line in the original (hard-precision figure) was colored
    'green' instead of 'gold'; all variance lines are 'gold' here.
    """
    plt.subplot(grid_rows, grid_cols, position)
    data = np.array(values)
    shade = plt.cm.Paired(color_idx / 10.0)
    plt.scatter(np.arange(0, len(data)), data, c=shade)
    plt.plot(data, c=shade, linestyle='dashed')
    plt.axhline(data.mean(), c='orangered', label='%s: %0.2f' % (avg_label, data.mean()))
    plt.axhline(data.std(), c='green', label='std. dev.: %0.3f' % data.std())
    plt.axhline(data.var(), c='gold', label='variance: %0.3f' % data.var())
    plt.axis('tight')
    plt.xlabel('Across different 500 sets')
    plt.ylabel(name)
    plt.title(name.upper())
    plt.legend(loc='lower right')

# One figure per metric: hard variants on the top row, soft on the bottom.
for metric, avg_label, hard_lists, soft_lists in (
        ('recall', 'avg recall',
         [hard_setosa_recall, hard_virginica_recall, hard_versicolor_recall],
         [soft_setosa_recall, soft_virginica_recall, soft_versicolor_recall]),
        ('precision', 'avg precision',
         [hard_setosa_precision, hard_virginica_precision, hard_versicolor_precision],
         [soft_setosa_precision, soft_virginica_precision, soft_versicolor_precision]),
        ('f1', 'avg f1',
         [hard_setosa_f1, hard_virginica_f1, hard_versicolor_f1],
         [soft_setosa_f1, soft_virginica_f1, soft_versicolor_f1])):
    fig = plt.figure(figsize=(25, 10))
    for i, (values, species) in enumerate(zip(hard_lists, ['setosa', 'virginica', 'versicolor'])):
        _metric_panel(i + 1, values, 'hard_%s_%s' % (species, metric), avg_label, color_idx=i)
    for i, (values, species) in enumerate(zip(soft_lists, ['setosa', 'virginica', 'versicolor'])):
        _metric_panel(i + 4, values, 'soft_%s_%s' % (species, metric), avg_label, color_idx=i)
    plt.tight_layout()
    plt.show()

# Accuracy scores: hard panel on top, soft below, in a narrower 2x1 figure.
fig = plt.figure(figsize=(9, 10))
_metric_panel(1, hard_score, 'hard_score', 'avg hard score', grid_cols=1)
_metric_panel(2, soft_score, 'soft_score', 'avg soft score', grid_cols=1)
plt.tight_layout()
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment