tanveer-sayyed/Soft Imputation on Iris Dataset.py

## Soft Imputation on Iris Dataset.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: tanveer
"""

"""On Spyder editor hit F5. On jupyter-notebook paste in a single cell and press ctrl+Enter. Run atleast 15 times."""
# Minimum recall that each of the three iris classes must reach on the test
# split before the re-imputation loop at the end of the script stops.
threshold = 0.70 # TRY thresholds -> {0.72, 0.73, 0.74, 0.75}

import time
# Wall-clock start; the elapsed time is printed at the end of the script.
start = time.time()

# Frequency table of every column, keyed by column label; filled by CountAll().
valueCounts = {}


def CountAll():
    """Refresh the module-level summary statistics of the global ``df``.

    Rebinds ``all_columns`` (list of column labels), ``nanCounts`` (number of
    missing cells per column) and ``nanPercent`` (the same count expressed as
    a percentage of the number of rows), and stores ``df[label].value_counts()``
    for every column in the already existing ``valueCounts`` dict.
    """
    global all_columns, nanCounts, valueCounts, nanPercent
    all_columns = df.columns.tolist()
    nanCounts = df.isna().sum()
    nanPercent = nanCounts / len(df) * 100
    # update() mutates the dict in place, so other references to it stay valid.
    valueCounts.update({label: df[label].value_counts() for label in all_columns})

"""-------------------------------------------------------------------------"""

"""Random but proportional replacement(RBPR) of categoricals."""
def Fill_NaNs_Catigorical(col):
    """Impute the missing cells of categorical column ``col`` of the global ``df``.

    Every category receives a share of the missing cells proportional to its
    observed frequency; which cells get which category is decided at random.
    ``df`` is modified in place and nothing is returned.

    The frequencies are taken from ``df`` itself, so the result does not
    depend on how recently ``CountAll()`` was run.

    Raises
    ------
    ValueError
        If the column has missing cells but no observed value at all.
    """
    n_missing = int(df[col].isnull().sum())
    if n_missing == 0:
        return
    frequencies = df[col].value_counts()  # NaN is excluded by default
    counts = _allocate_counts(frequencies.to_numpy(), n_missing)
    _assign_randomly(col, counts, frequencies.index)

"""-------------------------------------------------------------------------"""

"""Random but proportional replacement(RBPR) of numeric"""
def Fill_NaNs_Numeric(col, bin_size=45):
    """Impute the missing cells of numeric column ``col`` of the global ``df``.

    The observed values are grouped into ``bin_size`` bins between their
    minimum and maximum. Every bin receives a share of the missing cells
    proportional to the number of observations it holds, and those cells are
    filled with the mean of the bin. ``df`` is modified in place and nothing
    is returned.

    Parameters
    ----------
    col : column label
        Column of ``df`` to impute.
    bin_size : int, default 45
        Number of bin edges produced by ``np.linspace``. Bin 0 holds the
        values equal to the minimum; bin ``x`` holds the values in
        ``(edge[x-1], edge[x]]``.

    Raises
    ------
    ValueError
        If ``bin_size`` is smaller than 2, or if the column has missing cells
        but no observed value at all.
    """
    if bin_size < 2:
        raise ValueError('bin_size must be at least 2')
    n_missing = int(df[col].isnull().sum())
    if n_missing == 0:
        return

    """Selecting ONLY non-NaNs."""
    observed = df[col].dropna().to_numpy(dtype=float)
    if observed.size == 0:
        raise ValueError('cannot impute: the column has no observed values')

    """Filling the bins (with non-NaNs) and calculating mean of each bin."""
    edges = np.linspace(observed.min(), observed.max(), bin_size)
    # side='left' returns the first edge that is >= the value, which is
    # exactly the "edge[x-1] < value <= edge[x]" membership rule.
    bin_ids = np.searchsorted(edges, observed, side='left')
    non_NaNs_per_bin = np.bincount(bin_ids, minlength=bin_size)
    sum_of_bins = np.bincount(bin_ids, weights=observed, minlength=bin_size)

    # Empty bins have no mean of their own; they keep the overall mean so the
    # array stays NaN-free (they are never allotted any cells, see below).
    mean_of_bins = np.full(bin_size, observed.mean())
    filled = non_NaNs_per_bin > 0
    mean_of_bins[filled] = sum_of_bins[filled] / non_NaNs_per_bin[filled]

    counts = _allocate_counts(non_NaNs_per_bin, n_missing)
    _assign_randomly(col, counts, mean_of_bins)  # <--- Replacing with bin mean


def _allocate_counts(weights, total):
    """Split ``total`` into integer counts proportional to ``weights``.

    Parameters
    ----------
    weights : array-like of non-negative numbers
        Observed frequency of every candidate value (category or bin).
    total : int
        Number of missing cells that have to be filled.

    Returns
    -------
    numpy.ndarray of int
        One count per weight; the counts sum to ``total``.

    Raises
    ------
    ValueError
        If ``total`` is positive but every weight is zero.
    """
    weights = np.asarray(weights, dtype=float)
    total = int(total)
    if total == 0:
        return np.zeros(len(weights), dtype=int)
    occupied = np.flatnonzero(weights > 0).tolist()
    if not occupied:
        raise ValueError('cannot impute: the column has no observed values')

    """Calculating probability and expected value."""
    counts = np.around(weights / weights.sum() * total).astype(int)

    """Adjusting proportion."""
    # Rounding can leave the counts a few cells short of (or over) ``total``;
    # the difference is repaired one cell at a time at random positions.
    shortfall = total - int(counts.sum())
    for _ in range(shortfall):
        # Only candidates that were actually observed may gain a cell, so an
        # empty bin is never used as a replacement value.
        counts[random.choice(occupied)] += 1
    for _ in range(-shortfall):
        counts[random.choice(np.flatnonzero(counts > 0).tolist())] -= 1
    return counts


def _assign_randomly(col, counts, values):
    """Write ``values[i]`` into ``counts[i]`` randomly chosen missing cells of ``df[col]``."""
    pending = df.index[df[col].isnull()].tolist()
    # Shuffling once and slicing consecutive chunks is equivalent to drawing
    # repeated random samples without replacement.
    random.shuffle(pending)
    offset = 0
    for count, value in zip(counts.tolist(), values):
        if count > 0:
            df.loc[pending[offset:offset + count], col] = value
            offset += count

"""-------------------------------------------------------------------------"""

import pandas as pd
import numpy as np
import random
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import classification_report
from yellowbrick.classifier import PrecisionRecallCurve
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Important so that results are reproducible.
# ``seed`` is a function and has to be *called*: assigning to it
# (``random.seed = 0``) replaces the function with an int and leaves both
# generators unseeded.
np.random.seed(0)
random.seed(0)


""" STEP-1 """
iris = load_iris()
# Already free of impurities so .copy() not required
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Translate the integer class codes (0, 1, 2) into species names by indexing
# the names array, and assign the result to the column. Calling
# ``df['target'].replace(..., inplace=True)`` instead would modify a temporary
# Series, which no longer updates ``df`` under pandas Copy-on-Write.
df['target'] = pd.Series(iris.target_names[iris.target], index=df.index, dtype='object')

# Removing 20% values from each column: 30 distinct row positions are drawn
# independently per column (random.sample already returns a list).
idx_sepal_length = random.sample(range(len(df)), 30)
idx_sepal_width =  random.sample(range(len(df)), 30)
idx_petal_length = random.sample(range(len(df)), 30)
idx_petal_width =  random.sample(range(len(df)), 30)
idx_target =       random.sample(range(len(df)), 30)
# ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was removed in NumPy 2.0.
df.loc[idx_sepal_length, 'sepal length (cm)'] = np.nan
df.loc[idx_sepal_width,  'sepal width (cm)']  = np.nan
df.loc[idx_petal_length, 'petal length (cm)'] = np.nan
df.loc[idx_petal_width,  'petal width (cm)']  = np.nan
df.loc[idx_target,       'target']            = np.nan


""" STEP-2 """
CountAll()


""" STEP-3 """
# select_dtypes(include=np.number) selects every numeric dtype (integers as
# well as floats). Comparing ``df.dtypes == np.number`` instead relies on
# NumPy coercing the abstract ``np.number`` class to a concrete dtype, which
# is deprecated and matches float64 columns at best.
numerical = list(df.select_dtypes(include=np.number).columns)


""" STEP-4 """
# Everything that is not numeric is treated as categorical.
categorical = list(df.select_dtypes(exclude=np.number).columns)


""" STEP-5 """
# Column means before and after imputation, as float Series indexed by column
# label (``mean`` skips NaN by default). Computing them on the frame avoids
# growing empty ``pd.Series()`` objects, which have no numeric dtype.
original_mean = df[numerical].mean()
for col in numerical:
    Fill_NaNs_Numeric(col)
imputed_mean = df[numerical].mean()
print()
print('% change in mean:')
print((original_mean - imputed_mean)/original_mean*100)
print()
del original_mean, imputed_mean


""" STEP-6 """
# The mode is printed before and after so that a change of the most frequent
# category caused by the imputation is visible.
for col in categorical:
    print('Mode before imputing target: ', df[col].mode()[0])
    Fill_NaNs_Catigorical(col)
    print('Mode after imputing target: ', df[col].mode()[0])


""" STEP-7 """
# STEP-7 and STEP-8 share one evaluation: 70/30 split, a depth-3 entropy
# tree, per-class recall on the held-out part. Attempt 0 scores the frame
# imputed in STEP-5/6. Every further attempt blanks the same cells again,
# re-imputes them and is scored the same way. Each attempt is checked against
# ``threshold`` right after it is scored, so exactly ``max_attempts``
# re-imputations are tried and a success on the last one still counts.
species = ('setosa', 'virginica', 'versicolor')
max_attempts = 40
no_of_attempts = 0
while True:
    X = df.drop(columns='target')
    y = df['target']
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3) # no random_state specified
    dtC = DecisionTreeClassifier(max_depth=3, criterion='entropy')
    dtC.fit(X_train, y_train)
    y_pred = dtC.predict(X_test)
    classif_report = classification_report(y_test, y_pred, output_dict=True)

    if no_of_attempts == 0:
        print('\n::BEFORE tuning (test scores for DecisionTreeClassifier)')
        for name in species:
            print(name + ' recall: ', classif_report[name]['recall'])

    # condition is 1 when every class reaches the threshold, otherwise 0.
    condition = int(all(classif_report[name]['recall'] >= threshold for name in species))
    if condition or no_of_attempts == max_attempts:
        break

    # STEP-8: blank the originally removed cells again and re-impute them.
    no_of_attempts = no_of_attempts + 1
    df.loc[idx_sepal_length, 'sepal length (cm)'] = np.nan
    df.loc[idx_sepal_width,  'sepal width (cm)']  = np.nan
    df.loc[idx_petal_length, 'petal length (cm)'] = np.nan
    df.loc[idx_petal_width,  'petal width (cm)']  = np.nan
    df.loc[idx_target,       'target']            = np.nan

    CountAll()
    for col in numerical:
        Fill_NaNs_Numeric(col)
    for col in categorical:
        Fill_NaNs_Catigorical(col)

if condition:
    print('\n::AFTER tuning (test scores for DecisionTreeClassifier)')
    for name in species:
        print(name + ' recall: ', classif_report[name]['recall'])
    print('\nno of attempts in while loop: ', no_of_attempts)

    fig = plt.figure(figsize=(8,8))
    X = df.drop(columns='target')
    y = df['target']
    # The curve visualizer is fitted on integer class codes instead of names.
    y, uniques = pd.factorize(y)
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3) # no random_state specified
    viz = PrecisionRecallCurve(RandomForestClassifier(n_estimators=100), per_class=True, iso_f1_curves=True, fill_area=False, micro=False)
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    viz.poof()
    print('(Above curves plotted after one more train test split)')
else:
    print("\nCondition unreachable in 40 loops.")

end = time.time()
print('\nTime taken: ', str(end-start))

# Drop the script's globals so that a re-run in the same interactive session
# starts from a clean namespace.
del X, X_test, X_train, all_columns, categorical, classif_report, col, df, end
del iris, nanCounts, nanPercent, no_of_attempts, numerical, start, valueCounts
del idx_sepal_length, idx_sepal_width, idx_petal_length, idx_petal_width, idx_target, threshold
del y, y_pred, y_test, y_train, condition
del species, max_attempts, name
	#!/usr/bin/env python3
	# -- coding: utf-8 --
	"""
	@author: tanveer
	"""

	"""On Spyder editor hit F5. On jupyter-notebook paste in a single cell and press ctrl+Enter. Run atleast 15 times."""
	# Minimum recall that each of the three iris classes must reach on the test
	# split before the re-imputation loop at the end of the script stops.
	threshold = 0.70 # TRY thresholds -> {0.72, 0.73, 0.74, 0.75}

	import time
	# Wall-clock start; the elapsed time is printed at the end of the script.
	start = time.time()

	# Frequency table of every column, keyed by column label; filled by CountAll().
	valueCounts = {}


	def CountAll():
	    """Refresh the module-level summary statistics of the global ``df``.

	    Rebinds ``all_columns`` (list of column labels), ``nanCounts`` (number of
	    missing cells per column) and ``nanPercent`` (the same count expressed as
	    a percentage of the number of rows), and stores ``df[x].value_counts()``
	    for every column in the already existing ``valueCounts`` dict.
	    """
	    global all_columns, nanCounts, valueCounts, nanPercent
	    all_columns = list(df)
	    nanCounts = df.isnull().sum()
	    nanPercent = nanCounts / len(df) * 100
	    for x in all_columns:
	        valueCounts[x] = df[x].value_counts()

	"""-------------------------------------------------------------------------"""

	"""Random but proportional replacement(RBPR) of categoricals."""
	def Fill_NaNs_Catigorical(col):
	    """Impute the missing cells of categorical column ``col`` of the global ``df``.

	    Every category receives a share of the missing cells proportional to its
	    observed frequency; which cells get which category is decided at random.
	    ``df`` is modified in place and nothing is returned.

	    The frequencies are taken from ``df`` itself, so the result does not
	    depend on how recently ``CountAll()`` was run.

	    Raises
	    ------
	    ValueError
	        If the column has missing cells but no observed value at all.
	    """
	    n_missing = int(df[col].isnull().sum())
	    if n_missing == 0:
	        return
	    frequencies = df[col].value_counts()  # NaN is excluded by default
	    counts = _allocate_counts(frequencies.to_numpy(), n_missing)
	    _assign_randomly(col, counts, frequencies.index)

	"""-------------------------------------------------------------------------"""

	"""Random but proportional replacement(RBPR) of numeric"""
	def Fill_NaNs_Numeric(col, bin_size=45):
	    """Impute the missing cells of numeric column ``col`` of the global ``df``.

	    The observed values are grouped into ``bin_size`` bins between their
	    minimum and maximum. Every bin receives a share of the missing cells
	    proportional to the number of observations it holds, and those cells are
	    filled with the mean of the bin. ``df`` is modified in place and nothing
	    is returned.

	    Parameters
	    ----------
	    col : column label
	        Column of ``df`` to impute.
	    bin_size : int, default 45
	        Number of bin edges produced by ``np.linspace``. Bin 0 holds the
	        values equal to the minimum; bin ``x`` holds the values in
	        ``(edge[x-1], edge[x]]``.

	    Raises
	    ------
	    ValueError
	        If ``bin_size`` is smaller than 2, or if the column has missing cells
	        but no observed value at all.
	    """
	    if bin_size < 2:
	        raise ValueError('bin_size must be at least 2')
	    n_missing = int(df[col].isnull().sum())
	    if n_missing == 0:
	        return

	    """Selecting ONLY non-NaNs."""
	    observed = df[col].dropna().to_numpy(dtype=float)
	    if observed.size == 0:
	        raise ValueError('cannot impute: the column has no observed values')

	    """Filling the bins (with non-NaNs) and calculating mean of each bin."""
	    edges = np.linspace(observed.min(), observed.max(), bin_size)
	    # side='left' returns the first edge that is >= the value, which is
	    # exactly the "edge[x-1] < value <= edge[x]" membership rule.
	    bin_ids = np.searchsorted(edges, observed, side='left')
	    non_NaNs_per_bin = np.bincount(bin_ids, minlength=bin_size)
	    sum_of_bins = np.bincount(bin_ids, weights=observed, minlength=bin_size)

	    # Empty bins have no mean of their own; they keep the overall mean so the
	    # array stays NaN-free (they are never allotted any cells, see below).
	    mean_of_bins = np.full(bin_size, observed.mean())
	    filled = non_NaNs_per_bin > 0
	    mean_of_bins[filled] = sum_of_bins[filled] / non_NaNs_per_bin[filled]

	    counts = _allocate_counts(non_NaNs_per_bin, n_missing)
	    _assign_randomly(col, counts, mean_of_bins)  # <--- Replacing with bin mean


	def _allocate_counts(weights, total):
	    """Split ``total`` into integer counts proportional to ``weights``.

	    Parameters
	    ----------
	    weights : array-like of non-negative numbers
	        Observed frequency of every candidate value (category or bin).
	    total : int
	        Number of missing cells that have to be filled.

	    Returns
	    -------
	    numpy.ndarray of int
	        One count per weight; the counts sum to ``total``.

	    Raises
	    ------
	    ValueError
	        If ``total`` is positive but every weight is zero.
	    """
	    weights = np.asarray(weights, dtype=float)
	    total = int(total)
	    if total == 0:
	        return np.zeros(len(weights), dtype=int)
	    occupied = np.flatnonzero(weights > 0).tolist()
	    if not occupied:
	        raise ValueError('cannot impute: the column has no observed values')

	    """Calculating probability and expected value."""
	    counts = np.around(weights / weights.sum() * total).astype(int)

	    """Adjusting proportion."""
	    # Rounding can leave the counts a few cells short of (or over) ``total``;
	    # the difference is repaired one cell at a time at random positions.
	    shortfall = total - int(counts.sum())
	    for _ in range(shortfall):
	        # Only candidates that were actually observed may gain a cell, so an
	        # empty bin is never used as a replacement value.
	        counts[random.choice(occupied)] += 1
	    for _ in range(-shortfall):
	        counts[random.choice(np.flatnonzero(counts > 0).tolist())] -= 1
	    return counts


	def _assign_randomly(col, counts, values):
	    """Write ``values[i]`` into ``counts[i]`` randomly chosen missing cells of ``df[col]``."""
	    pending = df.index[df[col].isnull()].tolist()
	    # Shuffling once and slicing consecutive chunks is equivalent to drawing
	    # repeated random samples without replacement.
	    random.shuffle(pending)
	    offset = 0
	    for count, value in zip(counts.tolist(), values):
	        if count > 0:
	            df.loc[pending[offset:offset + count], col] = value
	            offset += count

	"""-------------------------------------------------------------------------"""

	import pandas as pd
	import numpy as np
	import random
	from sklearn.datasets import load_iris
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import train_test_split as tts
	from sklearn.metrics import classification_report
	from yellowbrick.classifier import PrecisionRecallCurve
	import matplotlib.pyplot as plt
	import warnings
	warnings.filterwarnings('ignore')

	# Important so that results are reproducible.
	# ``seed`` is a function and has to be *called*: assigning to it
	# (``random.seed = 0``) replaces the function with an int and leaves both
	# generators unseeded.
	np.random.seed(0)
	random.seed(0)


	""" STEP-1 """
	iris = load_iris()
	# Already free of impurities so .copy() not required
	df = pd.DataFrame(iris.data, columns=iris.feature_names)
	# Translate the integer class codes (0, 1, 2) into species names by indexing
	# the names array, and assign the result to the column. Calling
	# ``df['target'].replace(..., inplace=True)`` instead would modify a temporary
	# Series, which no longer updates ``df`` under pandas Copy-on-Write.
	df['target'] = pd.Series(iris.target_names[iris.target], index=df.index, dtype='object')

	# Removing 20% values from each column: 30 distinct row positions are drawn
	# independently per column (random.sample already returns a list).
	idx_sepal_length = random.sample(range(len(df)), 30)
	idx_sepal_width = random.sample(range(len(df)), 30)
	idx_petal_length = random.sample(range(len(df)), 30)
	idx_petal_width = random.sample(range(len(df)), 30)
	idx_target = random.sample(range(len(df)), 30)
	# ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was removed in NumPy 2.0.
	df.loc[idx_sepal_length, 'sepal length (cm)'] = np.nan
	df.loc[idx_sepal_width, 'sepal width (cm)'] = np.nan
	df.loc[idx_petal_length, 'petal length (cm)'] = np.nan
	df.loc[idx_petal_width, 'petal width (cm)'] = np.nan
	df.loc[idx_target, 'target'] = np.nan


	""" STEP-2 """
	CountAll()


	""" STEP-3 """
	# select_dtypes(include=np.number) selects every numeric dtype (integers as
	# well as floats). Comparing ``df.dtypes == np.number`` instead relies on
	# NumPy coercing the abstract ``np.number`` class to a concrete dtype, which
	# is deprecated and matches float64 columns at best.
	numerical = list(df.select_dtypes(include=np.number).columns)


	""" STEP-4 """
	# Everything that is not numeric is treated as categorical.
	categorical = list(df.select_dtypes(exclude=np.number).columns)


	""" STEP-5 """
	# Column means before and after imputation, as float Series indexed by column
	# label (``mean`` skips NaN by default). Computing them on the frame avoids
	# growing empty ``pd.Series()`` objects, which have no numeric dtype.
	original_mean = df[numerical].mean()
	for col in numerical:
	    Fill_NaNs_Numeric(col)
	imputed_mean = df[numerical].mean()
	print()
	print('% change in mean:')
	print((original_mean - imputed_mean)/original_mean*100)
	print()
	del original_mean, imputed_mean


	""" STEP-6 """
	# The mode is printed before and after so that a change of the most frequent
	# category caused by the imputation is visible.
	for col in categorical:
	    print('Mode before imputing target: ', df[col].mode()[0])
	    Fill_NaNs_Catigorical(col)
	    print('Mode after imputing target: ', df[col].mode()[0])


	""" STEP-7 """
	# STEP-7 and STEP-8 share one evaluation: 70/30 split, a depth-3 entropy
	# tree, per-class recall on the held-out part. Attempt 0 scores the frame
	# imputed in STEP-5/6. Every further attempt blanks the same cells again,
	# re-imputes them and is scored the same way. Each attempt is checked against
	# ``threshold`` right after it is scored, so exactly ``max_attempts``
	# re-imputations are tried and a success on the last one still counts.
	species = ('setosa', 'virginica', 'versicolor')
	max_attempts = 40
	no_of_attempts = 0
	while True:
	    X = df.drop(columns='target')
	    y = df['target']
	    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3) # no random_state specified
	    dtC = DecisionTreeClassifier(max_depth=3, criterion='entropy')
	    dtC.fit(X_train, y_train)
	    y_pred = dtC.predict(X_test)
	    classif_report = classification_report(y_test, y_pred, output_dict=True)

	    if no_of_attempts == 0:
	        print('\n::BEFORE tuning (test scores for DecisionTreeClassifier)')
	        for name in species:
	            print(name + ' recall: ', classif_report[name]['recall'])

	    # condition is 1 when every class reaches the threshold, otherwise 0.
	    condition = int(all(classif_report[name]['recall'] >= threshold for name in species))
	    if condition or no_of_attempts == max_attempts:
	        break

	    # STEP-8: blank the originally removed cells again and re-impute them.
	    no_of_attempts = no_of_attempts + 1
	    df.loc[idx_sepal_length, 'sepal length (cm)'] = np.nan
	    df.loc[idx_sepal_width, 'sepal width (cm)'] = np.nan
	    df.loc[idx_petal_length, 'petal length (cm)'] = np.nan
	    df.loc[idx_petal_width, 'petal width (cm)'] = np.nan
	    df.loc[idx_target, 'target'] = np.nan

	    CountAll()
	    for col in numerical:
	        Fill_NaNs_Numeric(col)
	    for col in categorical:
	        Fill_NaNs_Catigorical(col)

	if condition:
	    print('\n::AFTER tuning (test scores for DecisionTreeClassifier)')
	    for name in species:
	        print(name + ' recall: ', classif_report[name]['recall'])
	    print('\nno of attempts in while loop: ', no_of_attempts)

	    fig = plt.figure(figsize=(8,8))
	    X = df.drop(columns='target')
	    y = df['target']
	    # The curve visualizer is fitted on integer class codes instead of names.
	    y, uniques = pd.factorize(y)
	    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3) # no random_state specified
	    viz = PrecisionRecallCurve(RandomForestClassifier(n_estimators=100), per_class=True, iso_f1_curves=True, fill_area=False, micro=False)
	    viz.fit(X_train, y_train)
	    viz.score(X_test, y_test)
	    viz.poof()
	    print('(Above curves plotted after one more train test split)')
	else:
	    print("\nCondition unreachable in 40 loops.")

	end = time.time()
	print('\nTime taken: ', str(end-start))

	# Drop the script's globals so that a re-run in the same interactive session
	# starts from a clean namespace.
	del X, X_test, X_train, all_columns, categorical, classif_report, col, df, end
	del iris, nanCounts, nanPercent, no_of_attempts, numerical, start, valueCounts
	del idx_sepal_length, idx_sepal_width, idx_petal_length, idx_petal_width, idx_target, threshold
	del y, y_pred, y_test, y_train, condition
	del species, max_attempts, name