Skip to content

Instantly share code, notes, and snippets.

@tanveer-sayyed
Last active July 24, 2019 09:44
Show Gist options
  • Save tanveer-sayyed/bf2e75e23ea0a508bbebfeadb0aafabe to your computer and use it in GitHub Desktop.
Save tanveer-sayyed/bf2e75e23ea0a508bbebfeadb0aafabe to your computer and use it in GitHub Desktop.
added n_estimators= 100 in RandomForest
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: tanveer
"""
"""On Spyder editor hit F5. On jupyter-notebook paste in a single cell and press ctrl+Enter. Run atleast 15 times."""
import time

# Wall-clock reference used for the "Time taken" report at the end of the script.
start = time.time()
# Minimum per-class recall that the tuning loop tries to reach.
threshold = 0.70 # TRY thresholds -> {0.72, 0.73, 0.74, 0.75}
# Per-column value_counts() results; filled in by CountAll().
valueCounts = dict()
def CountAll():
    """Refresh the module-level summary statistics of the global ``df``.

    Rebinds three globals and refills a fourth:

    * ``all_columns`` - list of the column labels of ``df``.
    * ``nanCounts``   - number of missing values per column.
    * ``nanPercent``  - ``nanCounts`` as a percentage of the row count.
    * ``valueCounts`` - dict mapping each column label to that column's
      ``value_counts()`` Series (missing values are not counted).

    Takes no arguments and returns ``None``; everything is communicated
    through module globals.
    """
    global all_columns, nanCounts, valueCounts, nanPercent
    all_columns = list(df)
    nanCounts = df.isnull().sum()
    nanPercent = nanCounts / len(df) * 100
    # Empty the dict in place (keeping the same object) so that entries for
    # columns that are no longer part of df do not survive from earlier calls.
    valueCounts.clear()
    for x in all_columns:
        valueCounts[x] = df[x].value_counts()
"""-------------------------------------------------------------------------"""
"""Random but proportional replacement(RBPR) of categoricals."""
def Fill_NaNs_Catigorical(col):
    """Impute the missing cells of categorical column ``col`` of the global ``df``.

    Random-but-proportional replacement (RBPR): every observed category is
    given a share of the missing cells proportional to its observed
    frequency, and the cells that receive each category are drawn at random
    with the ``random`` module. ``df`` is modified in place.

    The category frequencies and the missing-cell count are read from ``df``
    at call time, so the result does not depend on previously cached counts.

    Parameters
    ----------
    col : hashable
        Label of the column of ``df`` to impute.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the column has missing cells but not a single observed value, so
        there is no distribution to draw replacements from.
    """
    nan_indexes = df[df[col].isnull()].index.tolist()
    n_missing = len(nan_indexes)
    if n_missing == 0:
        # Nothing to impute.
        return

    counts = df[col].value_counts()
    if counts.empty:
        raise ValueError(
            "cannot impute column %r: it has no observed values" % (col,)
        )

    # Expected number of missing cells per category, rounded to integers.
    proportion = np.around(counts.values / counts.sum() * n_missing).astype('int')

    # Rounding can leave the total off by a few cells; nudge randomly chosen
    # categories up or down until the total equals the number of missing cells.
    diff = n_missing - int(proportion.sum())
    if diff > 0:
        for _ in range(diff):
            idx = random.randint(0, len(proportion) - 1)
            proportion[idx] += 1
    else:
        diff = -diff
        while diff != 0:
            idx = random.randint(0, len(proportion) - 1)
            # Only categories that still have cells allotted can give one up.
            if proportion[idx] > 0:
                proportion[idx] -= 1
                diff -= 1

    # Hand out the missing cells category by category.
    categories = counts.index
    for position, quota in enumerate(proportion):
        if quota > 0:
            random_subset = random.sample(population=nan_indexes, k=int(quota))
            df.loc[random_subset, col] = categories[position]
            # Drop the cells just filled while keeping the original order, so
            # the pool does not depend on set iteration order (which varies
            # between interpreter runs for string labels).
            taken = set(random_subset)
            nan_indexes = [i for i in nan_indexes if i not in taken]
"""-------------------------------------------------------------------------"""
"""Random but proportional replacement(RBPR) of numeric"""
def Fill_NaNs_Numeric(col, bin_size=45):
    """Impute the missing cells of numeric column ``col`` of the global ``df``.

    Random-but-proportional replacement (RBPR) for continuous data: the
    observed values are grouped into ``bin_size`` bins spanning their
    min..max range, every bin is given a share of the missing cells
    proportional to the number of observed values in it, and the cells
    assigned to a bin are filled with that bin's mean. Which cells go to
    which bin is drawn at random with the ``random`` module. ``df`` is
    modified in place.

    Binning scheme: ``bin_size`` evenly spaced edges are laid out from the
    minimum to the maximum. The first bin holds values ``<=`` the first edge
    (i.e. the minimum itself); each further bin ``x`` holds values in the
    half-open interval ``(edge[x-1], edge[x]]``.

    The observed values and the missing-cell count are read from ``df`` at
    call time, so the result does not depend on previously cached counts.

    Parameters
    ----------
    col : hashable
        Label of the column of ``df`` to impute.
    bin_size : int, default 45
        Number of bin edges / bins. Must be at least 2.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``bin_size`` is smaller than 2, or if the column has missing cells
        but not a single observed value.
    """
    if bin_size < 2:
        raise ValueError("bin_size must be at least 2, got %r" % (bin_size,))

    nan_indexes = df[df[col].isnull()].index.tolist()
    n_missing = len(nan_indexes)
    if n_missing == 0:
        # Nothing to impute.
        return

    # Selecting ONLY non-NaNs.
    temp = df[df[col].notnull()][col]  # type --> pd.Series
    if temp.empty:
        raise ValueError(
            "cannot impute column %r: it has no observed values" % (col,)
        )

    bins = np.linspace(temp.min(), temp.max(), bin_size)

    # One boolean mask per bin, computed once and reused for count and mean.
    masks = [temp <= bins[0]]
    masks.extend(
        (temp > bins[x - 1]) & (temp <= bins[x]) for x in range(1, bin_size)
    )
    non_NaNs_per_bin = np.array([int(mask.sum()) for mask in masks])
    # The rounding adjustment below may allot cells to a bin that holds no
    # observed values; such a bin has no mean of its own, so it falls back to
    # the overall mean instead of yielding NaN.
    mean_of_bins = pd.Series([temp[mask].mean() for mask in masks]).fillna(temp.mean())

    # Expected number of missing cells per bin, rounded to integers.
    proportion = np.around(non_NaNs_per_bin / len(temp) * n_missing).astype('int')

    # Rounding can leave the total off by a few cells; nudge randomly chosen
    # bins up or down until the total equals the number of missing cells.
    diff = n_missing - int(proportion.sum())
    if diff > 0:
        for _ in range(diff):
            idx = random.randint(0, len(proportion) - 1)
            proportion[idx] += 1
    else:
        diff = -diff
        while diff != 0:
            idx = random.randint(0, len(proportion) - 1)
            # Only bins that still have cells allotted can give one up.
            if proportion[idx] > 0:
                proportion[idx] -= 1
                diff -= 1

    # Hand out the missing cells bin by bin.
    for position, quota in enumerate(proportion):
        if quota > 0:
            random_subset = random.sample(population=nan_indexes, k=int(quota))
            df.loc[random_subset, col] = mean_of_bins.iloc[position]  # <--- Replacing with bin mean
            # Drop the cells just filled while keeping the original order, so
            # the pool does not depend on set iteration order.
            taken = set(random_subset)
            nan_indexes = [i for i in nan_indexes if i not in taken]
"""-------------------------------------------------------------------------"""
import pandas as pd
import numpy as np
import random
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import classification_report
from yellowbrick.classifier import PrecisionRecallCurve
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Important so that results are reproducible.
# The seed functions have to be *called*: assigning to them
# (``random.seed = 0``) merely replaces the function with an integer and
# leaves both generators unseeded.
# NOTE: with fixed seeds every run of the script produces the same result;
# change the seed value to explore other random outcomes.
np.random.seed(0)
random.seed(0)
""" STEP-1 """
iris = load_iris()
# Already free of impurities so .copy() not required
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Translate the integer class codes into their names by indexing the array of
# names with the codes. The result is assigned to the column directly rather
# than calling replace(..., inplace=True) on a column selection, which is not
# guaranteed to write back into df.
df['target'] = pd.Series(iris.target_names[iris.target], index=df.index, dtype='object')
# Removing 20% values from each column (30 of the 150 rows), choosing the rows
# independently for every column.
idx_sepal_length = random.sample(range(len(df)), 30)
idx_sepal_width = random.sample(range(len(df)), 30)
idx_petal_length = random.sample(range(len(df)), 30)
idx_petal_width = random.sample(range(len(df)), 30)
idx_target = random.sample(range(len(df)), 30)
# np.nan is the canonical spelling; the np.NaN alias no longer exists in NumPy 2.
df.loc[idx_sepal_length, 'sepal length (cm)'] = np.nan
df.loc[idx_sepal_width, 'sepal width (cm)'] = np.nan
df.loc[idx_petal_length, 'petal length (cm)'] = np.nan
df.loc[idx_petal_width, 'petal width (cm)'] = np.nan
df.loc[idx_target, 'target'] = np.nan
""" STEP-2 """
CountAll()
""" STEP-3 """
# select_dtypes is the supported way to pick columns by dtype family;
# comparing df.dtypes against the abstract np.number type is not a valid
# dtype comparison on current NumPy.
numerical = list(df.select_dtypes(include=np.number).columns)
""" STEP-4 """
# Everything that is not numeric is treated as categorical.
categorical = list(df.select_dtypes(exclude=np.number).columns)
""" STEP-5 """
# Column means before and after imputation, taken in one vectorised call each.
# This avoids growing an empty, dtype-less pd.Series one label at a time
# (such a Series is object-typed on current pandas). Imputing one column does
# not touch the others, so the means equal those taken column by column.
original_mean = df[numerical].mean()
for col in numerical:
    Fill_NaNs_Numeric(col)
imputed_mean = df[numerical].mean()
print()
print('% change in mean:')
print((original_mean - imputed_mean)/original_mean*100)
print()
del original_mean, imputed_mean
""" STEP-6 """
def _print_mode(label, column):
    """Print ``label`` followed by the most frequent value of ``df[column]``."""
    print(label, df[column].mode()[0])


# Impute every categorical column, showing its mode before and after so the
# effect of the imputation on the dominant class is visible.
for col in categorical:
    _print_mode('Mode before imputing target: ', col)
    Fill_NaNs_Catigorical(col)
    _print_mode('Mode after imputing target: ', col)
""" STEP-7 """
# Features are every column except the label; the label is the 'target' column.
y = df['target']
X = df.drop(columns='target')
# 70/30 split; no random_state specified, so the split differs between calls.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3)
# Shallow entropy-based tree, trained on the imputed data.
dtC = DecisionTreeClassifier(criterion='entropy', max_depth=3).fit(X_train, y_train)
y_pred = dtC.predict(X_test)
# Dict form of the report so individual recalls can be looked up by class name
# (output_dict=False would give the printable text table instead).
classif_report = classification_report(y_test, y_pred, output_dict=True)
print('\n::BEFORE tuning (test scores for DecisionTreeClassifier)')
print('setosa recall: ', classif_report['setosa']['recall'])
print('virginica recall: ', classif_report['virginica']['recall'])
print('versicolor recall: ', classif_report['versicolor']['recall'])
""" STEP-8 """
def _recalls_meet_threshold(report):
    """Return True when all three species reach ``threshold`` recall in ``report``.

    ``report`` is the dict produced by ``classification_report(...,
    output_dict=True)``.
    """
    return all(
        report[species]['recall'] >= threshold
        for species in ('setosa', 'virginica', 'versicolor')
    )


# condition stays 1 while the target is considered reachable and drops to 0
# once the retry budget is used up without success.
condition = 1
no_of_attempts = 0
while not _recalls_meet_threshold(classif_report):
    # The budget is checked *before* starting another attempt, so at most 40
    # attempts are made and a 40th attempt that succeeds is reported as a
    # success. (Checking after the attempt ran 41 attempts and discarded the
    # outcome of the last one.)
    if no_of_attempts == 40:
        condition = 0
        break
    # Blank out the same cells again, then redo the random imputation.
    df.loc[idx_sepal_length, 'sepal length (cm)'] = np.nan
    df.loc[idx_sepal_width, 'sepal width (cm)'] = np.nan
    df.loc[idx_petal_length, 'petal length (cm)'] = np.nan
    df.loc[idx_petal_width, 'petal width (cm)'] = np.nan
    df.loc[idx_target, 'target'] = np.nan
    CountAll()
    for col in numerical:
        Fill_NaNs_Numeric(col)
    for col in categorical:
        Fill_NaNs_Catigorical(col)
    # Retrain and re-evaluate on a fresh split of the newly imputed data.
    X = df.drop(columns= 'target')
    y = df['target']
    X_train, X_test, y_train, y_test = tts(X, y, test_size= 0.3) # no random_state specified
    dtC = DecisionTreeClassifier(max_depth= 3, criterion='entropy')
    dtC.fit(X_train, y_train)
    y_pred = dtC.predict(X_test)
    classif_report = classification_report(y_test, y_pred, output_dict=True)
    no_of_attempts = no_of_attempts + 1
if condition:
    print('\n::AFTER tuning (test scores for DecisionTreeClassifier)')
    print('setosa recall: ', classif_report['setosa']['recall'])
    print('virginica recall: ', classif_report['virginica']['recall'])
    print('versicolor recall: ', classif_report['versicolor']['recall'])
    print('\nno of attempts in while loop: ', no_of_attempts)
    fig = plt.figure(figsize=(8,8))
    X = df.drop(columns= 'target')
    y = df['target']
    # The labels are converted to integer codes before being passed to the
    # precision-recall visualizer.
    y, uniques = pd.factorize(y)
    X_train, X_test, y_train, y_test = tts(X, y, test_size= 0.3) # no random_state specified
    viz = PrecisionRecallCurve(RandomForestClassifier(n_estimators= 100), per_class=True, iso_f1_curves=True, fill_area=False, micro=False)
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    # NOTE(review): newer Yellowbrick releases appear to prefer show() over
    # poof() - confirm against the installed version before changing.
    viz.poof()
    print('(Above curves plotted after one more train test split)')
else:
    print("\nCondition unreachable in 40 loops.")
# Report the elapsed wall-clock time since `start` was taken.
end = time.time()
print('\nTime taken: ', str(end - start))
# Remove the script's working variables from the module namespace so that an
# interactive session (e.g. a variable explorer) is left uncluttered.
del (
    X, X_test, X_train, all_columns, categorical, classif_report, col, df, end,
    iris, nanCounts, nanPercent, no_of_attempts, numerical, start, valueCounts,
    idx_sepal_length, idx_sepal_width, idx_petal_length, idx_petal_width, idx_target, threshold,
    y, y_pred, y_test, y_train, condition,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment