# ishwor2048/Game_of_Throne_Survival_Prediction.py

## Game_of_Throne_Survival_Prediction.py
#Importing required packages

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split # train/test split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_auc_score


# Load the character data set into the DataFrame "char_pred".
char_pred = pd.read_excel("GOT_character_predictions.xlsx")


###############################################################################
#                       Exploring the Dataset one by one
###############################################################################


# Bare expressions such as ``char_pred.head()`` only echo their value in an
# interactive console; printing them makes the summaries visible when the
# file is run as a script.
char_pred.info()             # rows, columns, dtypes, non-null counts (prints itself)
print(char_pred.head())      # first five rows of the dataset
print(char_pred.columns)     # column names only
print(char_pred.describe())  # count, mean, std, min, quartiles and max per column
print(char_pred.shape)       # (number of rows, number of columns)


# Missing value research: number of missing cells in every column.
# The check is made on the frame itself; ``char_pred.columns.isnull()`` would
# test the column *labels* rather than the data stored under them.
print(char_pred.isnull().sum())


# Flag missing values: for every column that has at least one missing entry,
# add a companion column 'm_<name>' holding 1 where the value is missing and
# 0 where it is present.
for col in list(char_pred.columns):
    is_missing = char_pred[col].isnull()

    # Columns without gaps get no flag column
    if not is_missing.any():
        continue

    char_pred['m_' + col] = is_missing.astype(int)


###############################################################################
#                            Data Exploration
###############################################################################


# Correlation analysis.
# Only numeric/boolean columns are correlated. The frame also holds text
# columns (culture, house, mother, ...), and recent pandas releases raise on
# those instead of silently leaving them out.
df_corr = (char_pred
           .select_dtypes(include=['number', 'bool'])
           .corr()
           .round(2))


print(df_corr)


# Correlation of every column with the target 'isAlive', strongest positive
# first. Printed so the ranking is visible when run as a script.
print(df_corr.loc['isAlive'].sort_values(ascending=False))


###############################################################################
            #Filling out dummies in each of the variables
###############################################################################


# Replace missing entries in the categorical columns with the label 'unknown'
for cat_col in ('culture', 'house', 'mother', 'father', 'heir', 'spouse'):
    char_pred[cat_col] = char_pred[cat_col].fillna('unknown')


# One indicator frame per categorical column; the first level of each column
# is dropped (drop_first=True).
# NOTE(review): every column was filled with the same 'unknown' label, so the
# dummy frames may each carry a column called 'unknown'; combining several of
# them would then give duplicate column labels - consider get_dummies(prefix=).
culture_dummies = pd.get_dummies(char_pred['culture'], drop_first=True)

house_dummies = pd.get_dummies(char_pred['house'], drop_first=True)

mother_dummies = pd.get_dummies(char_pred['mother'], drop_first=True)

father_dummies = pd.get_dummies(char_pred['father'], drop_first=True)

heir_dummies = pd.get_dummies(char_pred['heir'], drop_first=True)

spouse_dummies = pd.get_dummies(char_pred['spouse'], drop_first=True)


# Age: negative values are reset to zero, then the remaining gaps are filled
# with the mean of the (already clamped) column.
# ``.loc`` writes into char_pred itself, whereas the chained form
# ``char_pred['age'][mask] = 0`` may end up writing into a temporary copy.
char_pred.loc[char_pred['age'] < 0, 'age'] = 0

# Series.mean() skips missing values; it replaces the ``pd.np`` alias, which
# newer pandas releases no longer provide.
char_pred['age'] = char_pred['age'].fillna(char_pred['age'].mean())


# Date of birth: same treatment as age
char_pred.loc[char_pred['dateOfBirth'] < 0, 'dateOfBirth'] = 0

char_pred['dateOfBirth'] = char_pred['dateOfBirth'].fillna(
    char_pred['dateOfBirth'].mean())


# Now checking, column by column, whether any missing values remain.
# The check runs on the data; ``char_pred.columns.isnull()`` would only test
# the column labels.
print(char_pred.isnull().any())


###############################################################################
#                         Train Test Split : Building Base Model
###############################################################################


# Explanatory variables taken straight from char_pred: the original
# indicator/count columns plus the 'm_' missing-value flags
base_features = [
    'male',
    'book1_A_Game_Of_Thrones',
    'book2_A_Clash_Of_Kings',
    'book3_A_Storm_Of_Swords',
    'book4_A_Feast_For_Crows',
    'book5_A_Dance_with_Dragons',
    'isMarried',
    'isNoble',
    'numDeadRelations',
    'popularity',
    'm_title',
    'm_culture',
    'm_dateOfBirth',
    'm_mother',
    'm_father',
    'm_heir',
    'm_spouse',
    'm_isAliveMother',
    'm_isAliveFather',
    'm_isAliveHeir',
    'm_isAliveSpouse',
    'm_age',
]

char_pred_data_1 = char_pred.loc[:, base_features]


# Feature matrix: the base columns followed by the house and culture dummies
char_pred_data = pd.concat(
    [char_pred_data_1, house_dummies, culture_dummies],
    axis=1,
)


# Listing every column name currently in char_pred
for col in char_pred.columns:
    print(col)


# Target variable: survival flag
char_pred_target = char_pred['isAlive']


# Stratified train/test split, 10% of the rows held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    char_pred_data,
    char_pred_target,
    test_size=0.10,
    random_state=508,
    stratify=char_pred_target,
)


###############################################################################
#                           Predictive Models
###############################################################################


###############################################################################
###############################################################################
#               Random Forest Classifier in scikit-learn
###############################################################################
###############################################################################


# The two forests share every setting except the split criterion
forest_settings = dict(n_estimators=500,
                       max_depth=None,
                       min_samples_leaf=15,
                       bootstrap=True,
                       warm_start=False,
                       random_state=508)

# Full forest using gini
full_forest_gini = RandomForestClassifier(criterion='gini',
                                          **forest_settings)

# Full forest using entropy
full_forest_entropy = RandomForestClassifier(criterion='entropy',
                                             **forest_settings)


# Fitting the models (fit() returns the estimator itself)
full_gini_fit = full_forest_gini.fit(X_train, y_train)

full_entropy_fit = full_forest_entropy.fit(X_train, y_train)


# Checking if predictions are the same for each model.
# The comparison is element by element: equal sums alone would not prove that
# the two models agree on every character.
gini_pred = full_gini_fit.predict(X_test)
entropy_pred = full_entropy_fit.predict(X_test)

print('Gini and entropy predictions identical:',
      bool((gini_pred == entropy_pred).all()))


# Saving score objects (mean accuracy on each data set)
gini_full_train = full_gini_fit.score(X_train, y_train)
gini_full_test  = full_gini_fit.score(X_test, y_test)

entropy_full_train = full_entropy_fit.score(X_train, y_train)
entropy_full_test  = full_entropy_fit.score(X_test, y_test)


# Built-in round() works whether score() hands back a Python float or a
# NumPy scalar.

# Scoring the gini model
print('Training Score', round(gini_full_train, 4))
print('Testing Score:', round(gini_full_test, 4))


# Scoring the entropy model
print('Training Score', round(entropy_full_train, 4))
print('Testing Score:', round(entropy_full_test, 4))


#Training Score 0.7459
#Testing Score 0.7436


####################################
# Parameter tuning with GridSearchCV
####################################


# Creating a hyperparameter grid for the random forest search.
# Plain Python ranges replace ``pd.np.arange``: the ``pd.np`` alias is not
# available in newer pandas releases, and the candidate values are the same.
estimator_space = list(range(100, 1350, 250))   # 100, 350, ..., 1100 trees
leaf_space = list(range(1, 150, 15))            # 1, 16, ..., 136 samples per leaf
criterion_space = ['gini', 'entropy']
bootstrap_space = [True, False]
# NOTE(review): warm_start doubles the number of candidates; confirm it is
# meant to be part of the search.
warm_start_space = [True, False]


# Keys are RandomForestClassifier parameter names
param_grid = {'n_estimators' : estimator_space,
              'min_samples_leaf' : leaf_space,
              'criterion' : criterion_space,
              'bootstrap' : bootstrap_space,
              'warm_start' : warm_start_space}


# Building the model object one more time; the searched parameters come from
# param_grid, the rest are fixed here
full_forest_grid = RandomForestClassifier(max_depth = None,
                                          random_state = 508)


# Creating a GridSearchCV object: every combination in param_grid is
# evaluated with 3-fold cross validation
full_forest_cv = GridSearchCV(full_forest_grid, param_grid, cv = 3)


# Fit it to the training data
full_forest_cv.fit(X_train, y_train)


# Print the optimal parameters and best score.
# The labels name the model that was actually tuned (a random forest, not a
# logistic regression).
print("Tuned Random Forest Parameter:", full_forest_cv.best_params_)
print("Tuned Random Forest Accuracy:", round(full_forest_cv.best_score_, 4))


# Cross validating the gini random forest with three folds on the full
# feature matrix; cross_val_score returns one score per fold
cv_rf_3 = cross_val_score(full_forest_gini,
                          char_pred_data,
                          char_pred_target,
                          cv = 3)


print(cv_rf_3)


# Array methods replace ``pd.np.mean`` (the ``pd.np`` alias is not available
# in newer pandas releases)
print(round(cv_rf_3.mean(), 3))

print('\nAverage: ',
      round(cv_rf_3.mean(), 3),
      '\nMinimum: ',
      round(cv_rf_3.min(), 3),
      '\nMaximum: ',
      round(cv_rf_3.max(), 3))

#output:
#Average: 0.746
#Minimum: 0.745
#Maximum: 0.746


###############################################################################
###############################################################################
                        # Gradient Boosted Machines
###############################################################################
###############################################################################


# Building a full model for GBM
import sklearn  # only needed for the version check below

# The squared-error split criterion is spelled 'squared_error' from
# scikit-learn 1.0 on and 'mse' before that; pick the spelling the installed
# release accepts.
gbm_criterion = ('squared_error'
                 if int(sklearn.__version__.split('.')[0]) >= 1
                 else 'mse')

# ``loss`` is left at its default, which is the deviance / log-loss the
# script asked for under either spelling.
gbm_got = GradientBoostingClassifier(learning_rate = 1.5,
                                     n_estimators = 75,
                                     max_depth = 1,
                                     criterion = gbm_criterion,
                                     warm_start = False,
                                     random_state = 508,
                                     )


# Fitting gbm model on training dataset
gbm_got_fit = gbm_got.fit(X_train, y_train)


# predicting target based on X_test set
gbm_predict = gbm_got_fit.predict(X_test)


# predicting class probabilities (one column per class)
gbm_predict_prob = gbm_got_fit.predict_proba(X_test)


# Saving training and testing scores.
# Built-in round() works whether score() hands back a Python float or a
# NumPy scalar.
TraiScore_gbm = round(gbm_got_fit.score(X_train, y_train), 3)
TestScore_gbm = round(gbm_got_fit.score(X_test, y_test), 3)


# Training and Testing Scores
print('Training Score', round(gbm_got_fit.score(X_train, y_train), 4))
print('Testing Score:', round(gbm_got_fit.score(X_test, y_test), 4))

#Training Score 0.8132
#Testing Score 0.7692


# Cross Validation Score (three folds on the full feature matrix)
CV_gbm = cross_val_score(gbm_got_fit,
                         char_pred_data,
                         char_pred_target,
                         cv = 3)


# Array methods replace ``pd.np.mean`` (the ``pd.np`` alias is not available
# in newer pandas releases)
print('\nAverage: ',
      round(CV_gbm.mean(), 5),
      '\nMinimum: ',
      round(CV_gbm.min(), 5),
      '\nMaximum: ',
      round(CV_gbm.max(), 5))

#Average:  0.76926
#Minimum:  0.74074
#Maximum:  0.78737


# Computing AUC score from the second probability column and saving it
AUC_gbm = round(roc_auc_score(y_test, gbm_predict_prob[:, 1]), 3)

#0.741


###############################################################################
###############################################################################
        # Developing a Classification Base with KNearestNeighbor
###############################################################################
###############################################################################


# Neighbor optimization: fit one KNN classifier per neighbor count from 1 to
# 50 and keep its accuracy on the training and on the test data
neighbors_settings = range(1, 51)

training_accuracy = []
test_accuracy = []

for n_neighbors in neighbors_settings:
    # build and fit the model for this neighbor count
    clf = KNeighborsClassifier(n_neighbors = n_neighbors)
    clf.fit(X_train, y_train.values.ravel())

    # accuracy on the data the model was fitted on
    train_score = clf.score(X_train, y_train)
    training_accuracy.append(train_score)

    # accuracy on the held-out data
    test_score = clf.score(X_test, y_test)
    test_accuracy.append(test_score)


# Plotting both accuracy curves against the neighbor count
fig, ax = plt.subplots(figsize=(12,9))

ax.plot(neighbors_settings, training_accuracy, label = "training accuracy")
ax.plot(neighbors_settings, test_accuracy, label = "test accuracy")

ax.set_ylabel("Accuracy")
ax.set_xlabel("n_neighbors")
ax.legend()
plt.show()


# exploring the test accuracy recorded for every neighbor count
print(test_accuracy)


# Printing the neighbor count with the highest test accuracy
# (+ 1 turns the zero-based list position into a neighbor count)
print(test_accuracy.index(max(test_accuracy)) + 1)


# It looks like 4 neighbors is the most accurate
# NOTE(review): 4 is hard-coded rather than taken from the value printed
# above, and that value was picked by test-set accuracy - confirm intended.
knn_clf = KNeighborsClassifier(n_neighbors = 4)


# Fitting the model based on the training data, once; .values.ravel() passes
# the target as a flat array. (A second, identical fit added nothing.)
knn_clf_fit = knn_clf.fit(X_train,
                          y_train.values.ravel())


# Let's compare the testing score to the training score.
# Built-in round() works whether score() hands back a Python float or a
# NumPy scalar.
print('Training Score', round(knn_clf_fit.score(X_train,
                                                y_train), 4))

print('Testing Score:', round(knn_clf_fit.score(X_test,
                                                y_test), 4))
#Training Score 0.8538
#Testing Score: 0.8103


# Generating predictions and class probabilities from the 4-neighbor model
knn_clf_pred = knn_clf_fit.predict(X_test)

knn_clf_pred_probabilities = knn_clf_fit.predict_proba(X_test)

#Training Score 0.8538
#Testing Score 0.8103


###############################################################################
            # Cross Validation with k-folds for KNN (best score)
###############################################################################


# Cross validating the knn model with three folds on the full feature
# matrix; cross_val_score returns one score per fold
cv_knn_3 = cross_val_score(knn_clf,
                           char_pred_data,
                           char_pred_target,
                           cv = 3)


print(cv_knn_3)


# Array methods replace ``pd.np.mean`` (the ``pd.np`` alias is not available
# in newer pandas releases)
print(round(cv_knn_3.mean(), 3))

print('\nAverage: ',
      round(cv_knn_3.mean(), 3),
      '\nMinimum: ',
      round(cv_knn_3.min(), 3),
      '\nMaximum: ',
      round(cv_knn_3.max(), 3))


#Average: 0.729
#Minimum:  0.682
#Maximum:  0.763


# Confusion matrix of the KNN predictions against the true test labels
# (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true = y_test, y_pred = knn_clf_pred)
print(cm)


# Tick labels for the two classes, in the order 0 then 1
labels = ['Dead', 'Alive']


# Drawing the matrix as an annotated heatmap
heatmap_options = {'annot': True,
                   'xticklabels': labels,
                   'yticklabels': labels,
                   'cmap': 'Blues'}
sns.heatmap(cm, **heatmap_options)

plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion matrix of the KNN classifier')
plt.show()


###############################################################################
#                               ROC Curve
###############################################################################


# ROC curve of the selected KNN model.
# The probabilities come from knn_clf_fit (the 4-neighbor model scored above).
# ``clf`` is whatever the neighbor-search loop built last, i.e. the
# 50-neighbor model, which is not the one being reported.
y_pred_proba = knn_clf_fit.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)

# Built-in round() works whether the metric is a Python float or a NumPy scalar
auc = round(metrics.roc_auc_score(y_test, y_pred_proba), 3)

plt.plot(fpr, tpr, label="KNearestNeighbor, auc="+str(auc))
plt.legend(loc=4)
plt.show()

# NOTE(review): the figure below was recorded with the loop's last model
# (clf); re-run to refresh it for the 4-neighbor model.
#AUC Score : 0.7704


###############################################################################
###############################################################################
            # Let's compare model with the LogisticRegression
###############################################################################
###############################################################################


#Initiating logistic regression base (C is the inverse regularization strength)
logreg = LogisticRegression(C = 1)


#Fitting on the training dataset
logreg_fit = logreg.fit(X_train,
                        y_train)


# Running Predictions on the test dataset
logreg_pred = logreg_fit.predict(X_test)


# Let's compare the testing score to the training score.
# Built-in round() works whether score() hands back a Python float or a
# NumPy scalar.
print('Training Score', round(logreg_fit.score(X_train,
                                               y_train), 4))

print('Testing Score:', round(logreg_fit.score(X_test,
                                               y_test), 4))

#Training Score 0.8218
#Testing Score: 0.7538


# Confusion matrix of the logistic regression predictions against the true
# test labels (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true = y_test, y_pred = logreg_pred)
print(cm)


# Tick labels for the two classes, in the order 0 then 1
labels = ['Dead', 'Alive']


# Drawing the matrix as an annotated heatmap
heatmap_options = {'annot': True,
                   'xticklabels': labels,
                   'yticklabels': labels,
                   'cmap': 'Blues'}
sns.heatmap(cm, **heatmap_options)

plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion matrix of the classifier')
plt.show()


###############################################################################
                    # Creating a classification report
###############################################################################

# Classification report for the logistic regression, printed twice:
# first with the default class names, then with the names held in `labels`
for report_kwargs in ({}, {'target_names': labels}):
    print(classification_report(y_true = y_test,
                                y_pred = logreg_pred,
                                **report_kwargs))


# Exporting the true test labels next to the KNN predictions for submission
ishwor_model_prediction = (pd.DataFrame({'Actual' : y_test})
                           .assign(KNN_Predicted = knn_clf_pred))

ishwor_model_prediction.to_excel("Ishwor_Model_Prediction_KNN.xlsx")


# Exporting the worked data set (imputed values and missing-value flags
# included) to the local drive
char_pred.to_excel("ishwor_got_file_after_code.xlsx")
	#Importing required packages

	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.model_selection import train_test_split # train/test split
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.model_selection import cross_val_score
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.model_selection import GridSearchCV
	from sklearn.metrics import confusion_matrix
	from sklearn.ensemble import GradientBoostingClassifier
	from sklearn.metrics import classification_report
	from sklearn.linear_model import LogisticRegression
	from sklearn import metrics
	from sklearn.metrics import roc_auc_score



	# NOTE(review): this tab-indented section repeats the column-0 script found
	# earlier in the file. Every statement sits at the same tab level, including
	# the bodies of `for` and `if`, so this copy does not parse as Python as
	# written. The code is left untouched; only review notes are added.
	#importing data file into terminal with the dataframe name "char_pred"
	char_pred = pd.read_excel("GOT_character_predictions.xlsx")



	###############################################################################
	# Exploring the Dataset one by one
	###############################################################################


	char_pred.info() #Brief information about the dataset with rows, columns and null values
	char_pred.head() #Looking out first five rows of the dataset
	char_pred.columns #Printing out just column names of the dataset
	char_pred.describe() #Analyzing Count, Mean, Std, min, first quartile, mean, third Quartile and max values of each columns
	char_pred.shape #Looking for rows and column counts of the dataset



	#Missing value research
	# NOTE(review): .columns.isnull() tests the column labels, not the cell values.
	print(
	char_pred.columns
	.isnull()
	.sum()
	)


	#Flagging missing values and creating seperate column with missing value impute
	# NOTE(review): the string literal, the `if` and the assignment below are
	# meant to be nested inside this loop but are at the loop's own indent.
	for col in char_pred:

	""" Create columns that are 0s if a value was not missing and 1 if
	a value is missing. """

	if char_pred[col].isnull().any():
	char_pred['m_'+col] = char_pred[col].isnull().astype(int)



	###############################################################################
	# Data Exploration
	###############################################################################


	#Correlation Analysis
	df_corr = char_pred.corr().round(2)


	print(df_corr)


	df_corr.loc['isAlive'].sort_values(ascending = False)



	###############################################################################
	#Filling out dummies in each of the variables
	###############################################################################


	#Filling out na and dummies for culture
	char_pred['culture'] = char_pred['culture'].fillna('unknown')

	culture_dummies = pd.get_dummies((char_pred['culture']),
	drop_first=True)


	#Filling na and dummies for house
	char_pred['house'] = char_pred['house'].fillna('unknown')

	house_dummies = pd.get_dummies((char_pred['house']),
	drop_first=True)


	#Filling na and dummies for mother
	char_pred['mother'] = char_pred['mother'].fillna('unknown')

	mother_dummies = pd.get_dummies((char_pred['mother']),
	drop_first=True)


	#Filling na and dummies for father
	char_pred['father'] = char_pred['father'].fillna('unknown')

	father_dummies = pd.get_dummies((char_pred['father']),
	drop_first=True)


	#Filling na and dummies for heir
	char_pred['heir'] = char_pred['heir'].fillna('unknown')

	heir_dummies = pd.get_dummies((char_pred['heir']),
	drop_first=True)



	#Filling na and dummies for spouse
	char_pred['spouse'] = char_pred['spouse'].fillna('unknown')

	spouse_dummies = pd.get_dummies((char_pred['spouse']),
	drop_first=True)


	#Time to work for age
	# NOTE(review): chained assignment (df[col][mask] = value) may write to a
	# temporary copy; `pd.np` is presumably unavailable on newer pandas - verify.
	char_pred['age'][char_pred['age'] < 0] = 0

	char_pred['age'] = char_pred['age'].fillna(pd.np.mean(char_pred['age']))



	#Working on Date of Birth
	char_pred['dateOfBirth'][char_pred['dateOfBirth'] < 0] = 0

	char_pred['dateOfBirth'] = char_pred['dateOfBirth'].fillna(pd.np.mean(char_pred['dateOfBirth']))



	#Now Checking if there is missing values
	# NOTE(review): as above, this tests the column labels, not the data.
	print(
	char_pred.columns
	.isnull()
	.any()
	)



	###############################################################################
	# Train Test Split : Building Base Model
	###############################################################################


	# NOTE(review): tab-indented repeat of the feature-selection and
	# train/test-split section found earlier in the file; the continuation
	# lines and the loop body have lost their indentation. Code left untouched.
	# Preparing our model for train test split
	char_pred_data_1 = char_pred.loc[:,['male',
	'book1_A_Game_Of_Thrones',
	'book2_A_Clash_Of_Kings',
	'book3_A_Storm_Of_Swords',
	'book4_A_Feast_For_Crows',
	'book5_A_Dance_with_Dragons',
	'isMarried',
	'isNoble',
	'numDeadRelations',
	'popularity',
	'm_title',
	'm_culture',
	'm_dateOfBirth',
	'm_mother',
	'm_father',
	'm_heir',
	'm_spouse',
	'm_isAliveMother',
	'm_isAliveFather',
	'm_isAliveHeir',
	'm_isAliveSpouse',
	'm_age']]



	#Concatenating the Dataset and Dummies to create the training set:
	char_pred_data = pd.concat([char_pred_data_1.iloc[:,:],
	house_dummies,
	culture_dummies],
	axis=1)


	#target_variable
	# NOTE(review): the print below is the loop body but is not indented under it.
	for col in char_pred:
	print(col)


	#Setting up test (target) variable:
	char_pred_target = char_pred['isAlive']



	#Train/Test Split
	# 10% of the rows are held out, stratified on the target.
	X_train, X_test, y_train, y_test = train_test_split(
	char_pred_data,
	char_pred_target,
	test_size=0.10,
	random_state=508,
	stratify=char_pred_target)



	###############################################################################
	# Predictive Models
	###############################################################################



	###############################################################################
	###############################################################################
	# Random Forest Classifier in scikit-learn
	###############################################################################
	###############################################################################



	# NOTE(review): tab-indented repeat of the random forest section found
	# earlier in the file. Code left untouched; only review notes are added.
	# Full forest using gini
	full_forest_gini = RandomForestClassifier(n_estimators = 500,
	criterion = 'gini',
	max_depth = None,
	min_samples_leaf = 15,
	bootstrap = True,
	warm_start = False,
	random_state = 508)


	# Full forest using entropy
	full_forest_entropy = RandomForestClassifier(n_estimators = 500,
	criterion = 'entropy',
	max_depth = None,
	min_samples_leaf = 15,
	bootstrap = True,
	warm_start = False,
	random_state = 508)



	# Fitting the models
	full_gini_fit = full_forest_gini.fit(X_train, y_train)

	full_entropy_fit = full_forest_entropy.fit(X_train, y_train)



	#Checking if predictions are the same for each model
	# NOTE(review): both results below are discarded, and the second line
	# compares only the sums of the predictions, not the predictions themselves.
	pd.DataFrame(full_gini_fit.predict(X_test), full_entropy_fit.predict(X_test))

	full_gini_fit.predict(X_test).sum() == full_entropy_fit.predict(X_test).sum()



	# Scoring the gini model
	print('Training Score', full_gini_fit.score(X_train, y_train).round(4))
	print('Testing Score:', full_gini_fit.score(X_test, y_test).round(4))


	# Scoring the entropy model
	print('Training Score', full_entropy_fit.score(X_train, y_train).round(4))
	print('Testing Score:', full_entropy_fit.score(X_test, y_test).round(4))


	#Training Score 0.7459
	#Testing Score 0.7436



	# Saving score objects
	gini_full_train = full_gini_fit.score(X_train, y_train)
	gini_full_test = full_gini_fit.score(X_test, y_test)

	entropy_full_train = full_entropy_fit.score(X_train, y_train)
	entropy_full_test = full_entropy_fit.score(X_test, y_test)


	####################################
	# Parameter tuning with GridSearchCV
	####################################


	# Creating a hyperparameter grid
	# NOTE(review): `pd.np` is presumably unavailable on newer pandas - verify.
	estimator_space = pd.np.arange(100, 1350, 250)
	leaf_space = pd.np.arange(1, 150, 15)
	criterion_space = ['gini', 'entropy']
	bootstrap_space = [True, False]
	warm_start_space = [True, False]



	param_grid = {'n_estimators' : estimator_space,
	'min_samples_leaf' : leaf_space,
	'criterion' : criterion_space,
	'bootstrap' : bootstrap_space,
	'warm_start' : warm_start_space}



	# Building the model object one more time
	full_forest_grid = RandomForestClassifier(max_depth = None,
	random_state = 508)


	# Creating a GridSearchCV object
	full_forest_cv = GridSearchCV(full_forest_grid, param_grid, cv = 3)



	# Fit it to the training data
	full_forest_cv.fit(X_train, y_train)



	# Print the optimal parameters and best score
	# NOTE(review): the labels say "Logistic Regression" but the tuned model
	# is the RandomForestClassifier built above.
	print("Tuned Logistic Regression Parameter:", full_forest_cv.best_params_)
	print("Tuned Logistic Regression Accuracy:", full_forest_cv.best_score_.round(4))




	# Cross Validating the RandomForest model with three folds
	cv_rf_3 = cross_val_score(full_forest_gini,
	char_pred_data,
	char_pred_target,
	cv = 3)


	print(cv_rf_3)


	print(pd.np.mean(cv_rf_3).round(3))

	print('\nAverage: ',
	pd.np.mean(cv_rf_3).round(3),
	'\nMinimum: ',
	min(cv_rf_3).round(3),
	'\nMaximum: ',
	max(cv_rf_3).round(3))

	#output:
	#Average: 0.746
	#Minimum: 0.745
	#Maximum: 0.746



	###############################################################################
	###############################################################################
	# Gradient Boosted Machines
	###############################################################################
	###############################################################################


	# NOTE(review): tab-indented repeat of the gradient boosting section found
	# earlier in the file. Code left untouched; only review notes are added.
	# Building a full model for GBM
	# NOTE(review): the 'deviance' and 'mse' spellings are presumably rejected
	# by current scikit-learn releases - verify against the installed version.
	gbm_got = GradientBoostingClassifier(loss = 'deviance',
	learning_rate = 1.5,
	n_estimators = 75,
	max_depth = 1,
	criterion = 'mse',
	warm_start = False,
	random_state = 508,
	)


	# Fitting gbm model on training dataset
	gbm_got_fit = gbm_got.fit(X_train, y_train)



	# predicting target based on X_test set
	gbm_predict = gbm_got_fit.predict(X_test)


	# predicting probability of survival
	gbm_predict_prob = gbm_got_fit.predict_proba(X_test)



	# Training and Testing Scores
	print('Training Score', gbm_got_fit.score(X_train, y_train).round(4))
	print('Testing Score:', gbm_got_fit.score(X_test, y_test).round(4))

	#Training Score 0.8132
	#Testing Score 0.7692




	# Saving training and testing scores
	TraiScore_gbm = gbm_got_fit.score(X_train, y_train).round(3)
	TestScore_gbm = gbm_got_fit.score(X_test, y_test).round(3)



	# Cross Validation Score
	CV_gbm = cross_val_score(gbm_got_fit,
	char_pred_data,
	char_pred_target,
	cv = 3)


	# NOTE(review): `pd.np` is presumably unavailable on newer pandas - verify.
	print('\nAverage: ',
	pd.np.mean(CV_gbm).round(5),
	'\nMinimum: ',
	min(CV_gbm).round(5),
	'\nMaximum: ',
	max(CV_gbm).round(5))

	#Average: 0.76926
	#Minimum: 0.74074
	#Maximum: 0.78737



	# Computing AUC score and save it
	# The second probability column is the one scored.
	AUC_gbm = roc_auc_score(y_test, gbm_predict_prob[:, 1]).round(3)

	#0.741




	###############################################################################
	###############################################################################
	# Developing a Classification Base with KNearestNeighbor
	###############################################################################
	###############################################################################


	# NOTE(review): tab-indented repeat of the KNN section found earlier in the
	# file. Code left untouched; only review notes are added.
	# Running the neighbor optimization code with a small adjustment for classification
	training_accuracy = []
	test_accuracy = []

	neighbors_settings = range(1, 51)


	# NOTE(review): the statements down to the second append are meant to be
	# the loop body but are at the loop's own indent.
	for n_neighbors in neighbors_settings:
	# build the model
	clf = KNeighborsClassifier(n_neighbors = n_neighbors)
	clf.fit(X_train,
	y_train.values.ravel())

	# record training set accuracy
	training_accuracy.append(clf.score(X_train,
	y_train))

	# record generalization accuracy
	test_accuracy.append(clf.score(X_test,
	y_test))




	#Plotting the accuracy score
	fig, ax = plt.subplots(figsize=(12,9))
	plt.plot(neighbors_settings,
	training_accuracy,
	label = "training accuracy")

	plt.plot(neighbors_settings,
	test_accuracy,
	label = "test accuracy")

	plt.ylabel("Accuracy")
	plt.xlabel("n_neighbors")
	plt.legend()
	plt.show()




	# exploring the highest test accuracy
	print(test_accuracy)


	# Printing highest test accuracy
	# (+ 1 turns the zero-based list position into a neighbor count)
	print(test_accuracy.index(max(test_accuracy)) + 1)



	# It looks like 4 neighbors is the most accurate
	knn_clf = KNeighborsClassifier(n_neighbors = 4)


	# Fitting the model based on the training data
	# NOTE(review): this fit is repeated immediately below with a flattened target.
	knn_clf_fit = knn_clf.fit(X_train,
	y_train)


	#try adding .values.ravel() to you code as in the code below
	knn_clf_fit = knn_clf.fit(X_train,
	y_train.values.ravel())



	# Let's compare the testing score to the training score.
	print('Training Score', knn_clf_fit.score(X_train,
	y_train).round(4))

	print('Testing Score:', knn_clf_fit.score(X_test,
	y_test).round(4))
	#Training Score 0.8538
	#Testing Score: 0.8103



	# Generating Predictions based on the optimal KNN model
	knn_clf_pred = knn_clf_fit.predict(X_test)

	knn_clf_pred_probabilities = knn_clf_fit.predict_proba(X_test)

	#Training Score 0.8538
	#Testing Score 0.8103




	###############################################################################
	# Cross Validation with k-folds for KNN (best score)
	###############################################################################


	# Cross Validating the knn model with three folds
	cv_knn_3 = cross_val_score(knn_clf,
	char_pred_data,
	char_pred_target,
	cv = 3)


	print(cv_knn_3)


	# NOTE(review): `pd.np` is presumably unavailable on newer pandas - verify.
	print(pd.np.mean(cv_knn_3).round(3))

	print('\nAverage: ',
	pd.np.mean(cv_knn_3).round(3),
	'\nMinimum: ',
	min(cv_knn_3).round(3),
	'\nMaximum: ',
	max(cv_knn_3).round(3))


	#Average: 0.729
	#Minimum: 0.682
	#Maximum: 0.763


	#Creating a confusion matrix for KNN
	print(confusion_matrix(y_true = y_test,
	y_pred = knn_clf_pred))



	#Providing label to the confusion matrix
	labels = ['Dead',
	'Alive']

	cm = confusion_matrix(y_true = y_test,
	y_pred = knn_clf_pred)



	#Creating Heatmap
	sns.heatmap(cm,
	annot = True,
	xticklabels = labels,
	yticklabels = labels,
	cmap = 'Blues')


	plt.xlabel('Predicted')
	plt.ylabel('Actual')
	plt.title('Confusion matrix of the KNN classifier')
	plt.show()



	###############################################################################
	# ROC Curve
	###############################################################################


	# NOTE(review): `clf` is the last model built by the neighbor loop
	# (n_neighbors = 50), not knn_clf_fit (n_neighbors = 4) - confirm which
	# model this curve is meant to describe.
	y_pred_proba = clf.predict_proba(X_test)[::,1]
	fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
	auc = metrics.roc_auc_score(y_test, y_pred_proba).round(3)
	plt.plot(fpr,tpr,label="KNearestNeighbor, auc="+str(auc))
	plt.legend(loc=4)
	plt.show()

	#AUC Score : 0.7704





	###############################################################################
	###############################################################################
	# Let's compare model with the LogisticRegression
	###############################################################################
	###############################################################################


	# NOTE(review): tab-indented repeat of the logistic regression and export
	# section found earlier in the file. Code left untouched; only review
	# notes are added.
	#Initiating logistic regression base
	logreg = LogisticRegression(C = 1)


	#Fitting training and testing dataset
	logreg_fit = logreg.fit(X_train,
	y_train)


	# Running Predictions
	logreg_pred = logreg_fit.predict(X_test)


	# Let's compare the testing score to the training score.
	print('Training Score', logreg_fit.score(X_train,
	y_train).round(4))

	print('Testing Score:', logreg_fit.score(X_test,
	y_test).round(4))

	#Training Score 0.8218
	#Testing Score: 0.7538



	#Creating a confusion matrix for logreg

	print(confusion_matrix(y_true = y_test,
	y_pred = logreg_pred))


	#Providing label to the confusion matrix
	labels = ['Dead',
	'Alive']

	cm = confusion_matrix(y_true = y_test,
	y_pred = logreg_pred)


	#Creating Heatmap
	sns.heatmap(cm,
	annot = True,
	xticklabels = labels,
	yticklabels = labels,
	cmap = 'Blues')


	plt.xlabel('Predicted')
	plt.ylabel('Actual')
	plt.title('Confusion matrix of the classifier')
	plt.show()




	###############################################################################
	# Creating a classification report
	###############################################################################

	print(classification_report(y_true = y_test,
	y_pred = logreg_pred))


	# Changing the labels on the classification report
	print(classification_report(y_true = y_test,
	y_pred = logreg_pred,
	target_names = labels))


	#Now we will export the predictions to excel sheet for submission
	# The frame pairs the true test labels with the KNN predictions.
	ishwor_model_prediction = pd.DataFrame({'Actual' : y_test,
	'KNN_Predicted': knn_clf_pred})

	ishwor_model_prediction.to_excel("Ishwor_Model_Prediction_KNN.xlsx")


	#Now we will export the Final Missing Imputed & worked sheet to local drive
	char_pred.to_excel("ishwor_got_file_after_code.xlsx")