Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ishwor2048/ead8ad6bfd4bd2767a452b449d3b110c to your computer and use it in GitHub Desktop.
Save ishwor2048/ead8ad6bfd4bd2767a452b449d3b110c to your computer and use it in GitHub Desktop.
#Importing required packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split # train/test split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_auc_score
# Load the character data into the dataframe "char_pred".
char_pred = pd.read_excel("GOT_character_predictions.xlsx")

###############################################################################
# Exploring the Dataset one by one
###############################################################################
# Outside an interactive session a bare expression such as `char_pred.head()`
# is evaluated and thrown away, so every summary is printed explicitly.
char_pred.info()             # rows, columns, dtypes and non-null counts (prints itself)
print(char_pred.head())      # first five rows
print(char_pred.columns)     # column names only
print(char_pred.describe())  # count, mean, std, min, quartiles and max per numeric column
print(char_pred.shape)       # (row count, column count)
# Missing value research: number of null cells in each column.
# `char_pred.columns.isnull()` would test the column *labels*, not the data,
# so the null check is made on the frame itself.
print(char_pred.isnull().sum())
# Flag missing values: every column that contains at least one null gets a
# companion column named 'm_<column>' holding 1 where the value is missing
# and 0 where it is present.
null_mask = char_pred.isnull()
for col in null_mask.columns[null_mask.any()]:
    char_pred['m_' + col] = null_mask[col].astype(int)
###############################################################################
# Data Exploration
###############################################################################
# Correlation analysis.
# The frame still holds text columns (e.g. culture, house); recent pandas
# versions raise when `corr()` meets non-numeric data, so the correlation is
# restricted to numeric/boolean columns explicitly.
numeric_columns = char_pred.select_dtypes(include=['number', 'bool'])
df_corr = numeric_columns.corr().round(2)
print(df_corr)

# Correlation of every numeric feature with the target, strongest positive
# first. Printed so that the ranking is visible when run as a script.
print(df_corr.loc['isAlive'].sort_values(ascending=False))
###############################################################################
#Filling out dummies in each of the variables
###############################################################################
# Replace missing values in the categorical columns with the literal
# 'unknown' so that "missing" becomes a category of its own.
for categorical_column in ('culture', 'house', 'mother',
                           'father', 'heir', 'spouse'):
    char_pred[categorical_column] = (
        char_pred[categorical_column].fillna('unknown'))

# One-hot encode each categorical column (first level dropped).
# A per-column prefix keeps the generated labels unique: without it, levels
# shared between columns (at minimum the 'unknown' filler) yield identically
# named columns once several dummy frames are concatenated side by side.
culture_dummies = pd.get_dummies(char_pred['culture'],
                                 prefix='culture',
                                 drop_first=True)
house_dummies = pd.get_dummies(char_pred['house'],
                               prefix='house',
                               drop_first=True)
mother_dummies = pd.get_dummies(char_pred['mother'],
                                prefix='mother',
                                drop_first=True)
father_dummies = pd.get_dummies(char_pred['father'],
                                prefix='father',
                                drop_first=True)
heir_dummies = pd.get_dummies(char_pred['heir'],
                              prefix='heir',
                              drop_first=True)
spouse_dummies = pd.get_dummies(char_pred['spouse'],
                                prefix='spouse',
                                drop_first=True)
# Clean up 'age' and 'dateOfBirth': negative values are clamped to 0, then
# remaining nulls are filled with the column mean (computed after clamping).
#
# `.loc[mask, column] = value` is used instead of chained indexing
# (`frame[column][mask] = value`), which may write to a temporary copy, and
# `Series.mean()` replaces the `pd.np` alias that newer pandas no longer has.
for numeric_column in ('age', 'dateOfBirth'):
    char_pred.loc[char_pred[numeric_column] < 0, numeric_column] = 0
    char_pred[numeric_column] = char_pred[numeric_column].fillna(
        char_pred[numeric_column].mean())
# Re-check for missing values after imputation: one True/False per column.
# The check is made on the data itself; `char_pred.columns.isnull()` would
# only inspect the column labels.
print(char_pred.isnull().any())
###############################################################################
# Train Test Split : Building Base Model
###############################################################################
# Assemble the base feature set for the train/test split: the gender flag,
# per-book appearance flags, status/popularity columns and the 'm_*'
# missing-value indicator columns.
book_columns = ['book1_A_Game_Of_Thrones',
                'book2_A_Clash_Of_Kings',
                'book3_A_Storm_Of_Swords',
                'book4_A_Feast_For_Crows',
                'book5_A_Dance_with_Dragons']

status_columns = ['isMarried',
                  'isNoble',
                  'numDeadRelations',
                  'popularity']

missing_flag_columns = ['m_title',
                        'm_culture',
                        'm_dateOfBirth',
                        'm_mother',
                        'm_father',
                        'm_heir',
                        'm_spouse',
                        'm_isAliveMother',
                        'm_isAliveFather',
                        'm_isAliveHeir',
                        'm_isAliveSpouse',
                        'm_age']

base_feature_columns = (['male']
                        + book_columns
                        + status_columns
                        + missing_flag_columns)

char_pred_data_1 = char_pred.loc[:, base_feature_columns]
# Build the full feature matrix: base features followed by the house and
# culture dummy columns, joined column-wise on the shared row index.
feature_frames = [char_pred_data_1, house_dummies, culture_dummies]
char_pred_data = pd.concat(feature_frames, axis=1)
# List every column currently in the dataframe, one name per line.
for column_name in char_pred.columns:
    print(column_name)

# The target (response) variable is the 'isAlive' column.
char_pred_target = char_pred.loc[:, 'isAlive']
# Train/test split: 10% of the rows are held out for testing, stratified on
# the target, with a fixed seed for reproducibility.
split_options = {'test_size': 0.10,
                 'random_state': 508,
                 'stratify': char_pred_target}

X_train, X_test, y_train, y_test = train_test_split(char_pred_data,
                                                    char_pred_target,
                                                    **split_options)
###############################################################################
# Predictive Models
###############################################################################
###############################################################################
###############################################################################
# Random Forest Classifier in scikit-learn
###############################################################################
###############################################################################
# Settings shared by both forests; only the split criterion differs.
shared_forest_options = dict(n_estimators=500,
                             max_depth=None,
                             min_samples_leaf=15,
                             bootstrap=True,
                             warm_start=False,
                             random_state=508)

# Full forest using gini
full_forest_gini = RandomForestClassifier(criterion='gini',
                                          **shared_forest_options)

# Full forest using entropy
full_forest_entropy = RandomForestClassifier(criterion='entropy',
                                             **shared_forest_options)
# Fitting the models
full_gini_fit = full_forest_gini.fit(X_train, y_train)
full_entropy_fit = full_forest_entropy.fit(X_train, y_train)

# Checking whether the two forests predict the same class for every test row.
# The predictions are placed side by side as two named columns (passing the
# second array positionally to pd.DataFrame would make it the *index*), and
# they are compared element-wise: equal sums alone would not show that the
# individual predictions agree.
gini_test_pred = full_gini_fit.predict(X_test)
entropy_test_pred = full_entropy_fit.predict(X_test)
forest_pred_comparison = pd.DataFrame({'gini': gini_test_pred,
                                       'entropy': entropy_test_pred},
                                      index=X_test.index)
forest_pred_mismatches = int(
    (forest_pred_comparison['gini']
     != forest_pred_comparison['entropy']).sum())
print('Gini/entropy predictions identical:', forest_pred_mismatches == 0,
      '| differing rows:', forest_pred_mismatches)

# Saving score objects (each score is computed once and reused for printing)
gini_full_train = full_gini_fit.score(X_train, y_train)
gini_full_test = full_gini_fit.score(X_test, y_test)
entropy_full_train = full_entropy_fit.score(X_train, y_train)
entropy_full_test = full_entropy_fit.score(X_test, y_test)

# The built-in round() is used because it works whether `score` returns a
# NumPy scalar or a plain Python float (which has no `.round()` method).
# Scoring the gini model
print('Training Score', round(gini_full_train, 4))
print('Testing Score:', round(gini_full_test, 4))

# Scoring the entropy model
print('Training Score', round(entropy_full_train, 4))
print('Testing Score:', round(entropy_full_test, 4))
#Training Score 0.7459
#Testing Score 0.7436
####################################
# Parameter tuning with GridSearchCV
####################################
# Creating a hyperparameter grid.
# Plain Python ranges replace `pd.np.arange`: the `pd.np` alias is not
# available in newer pandas, and the candidate values are identical.
estimator_space = list(range(100, 1350, 250))
leaf_space = list(range(1, 150, 15))
criterion_space = ['gini', 'entropy']
bootstrap_space = [True, False]
warm_start_space = [True, False]

param_grid = {'n_estimators' : estimator_space,
              'min_samples_leaf' : leaf_space,
              'criterion' : criterion_space,
              'bootstrap' : bootstrap_space,
              'warm_start' : warm_start_space}

# Building the model object one more time
full_forest_grid = RandomForestClassifier(max_depth = None,
                                          random_state = 508)

# Creating a GridSearchCV object (3-fold cross-validation per combination)
full_forest_cv = GridSearchCV(full_forest_grid, param_grid, cv = 3)

# Fit it to the training data
full_forest_cv.fit(X_train, y_train)

# Print the optimal parameters and best score.
# The labels name the model that was actually tuned (a random forest).
print("Tuned Random Forest Parameter:", full_forest_cv.best_params_)
print("Tuned Random Forest Accuracy:", round(full_forest_cv.best_score_, 4))
# Cross Validating the RandomForest model with three folds
cv_rf_3 = cross_val_score(full_forest_gini,
                          char_pred_data,
                          char_pred_target,
                          cv = 3)

# Summary statistics come from the score array's own methods; the `pd.np`
# alias used previously is not available in newer pandas.
cv_rf_3_mean = round(cv_rf_3.mean(), 3)

print(cv_rf_3)
print(cv_rf_3_mean)
print('\nAverage: ',
      cv_rf_3_mean,
      '\nMinimum: ',
      round(cv_rf_3.min(), 3),
      '\nMaximum: ',
      round(cv_rf_3.max(), 3))
#output:
#Average: 0.746
#Minimum: 0.745
#Maximum: 0.746
###############################################################################
###############################################################################
# Gradient Boosted Machines
###############################################################################
###############################################################################
# Building a full model for GBM.
# 'log_loss' and 'squared_error' are the current scikit-learn names for the
# former 'deviance' loss and 'mse' criterion; the old spellings are rejected
# by recent releases.
gbm_got = GradientBoostingClassifier(loss = 'log_loss',
                                     learning_rate = 1.5,
                                     n_estimators = 75,
                                     max_depth = 1,
                                     criterion = 'squared_error',
                                     warm_start = False,
                                     random_state = 508,
                                     )
# Fitting gbm model on training dataset
gbm_got_fit = gbm_got.fit(X_train, y_train)

# predicting target based on X_test set
gbm_predict = gbm_got_fit.predict(X_test)

# predicting class probabilities (one column per class) for the test set
gbm_predict_prob = gbm_got_fit.predict_proba(X_test)

# Training and Testing Scores: computed once, then rounded with the built-in
# round(), which works for NumPy scalars and plain Python floats alike.
gbm_train_score = gbm_got_fit.score(X_train, y_train)
gbm_test_score = gbm_got_fit.score(X_test, y_test)
print('Training Score', round(gbm_train_score, 4))
print('Testing Score:', round(gbm_test_score, 4))
#Training Score 0.8132
#Testing Score 0.7692

# Saving training and testing scores
TraiScore_gbm = round(gbm_train_score, 3)
TestScore_gbm = round(gbm_test_score, 3)
# Cross Validation Score (three folds over the full feature matrix)
CV_gbm = cross_val_score(gbm_got_fit,
                         char_pred_data,
                         char_pred_target,
                         cv = 3)

# Summary statistics use the score array's own methods; the `pd.np` alias
# used previously is not available in newer pandas.
print('\nAverage: ',
      round(CV_gbm.mean(), 5),
      '\nMinimum: ',
      round(CV_gbm.min(), 5),
      '\nMaximum: ',
      round(CV_gbm.max(), 5))
#Average: 0.76926
#Minimum: 0.74074
#Maximum: 0.78737
# Computing AUC score from the second probability column and saving it.
# Built-in round() is used so this works whether roc_auc_score returns a
# NumPy scalar or a plain Python float.
AUC_gbm = round(roc_auc_score(y_test, gbm_predict_prob[:, 1]), 3)
#0.741
###############################################################################
###############################################################################
# Developing a Classification Base with KNearestNeighbor
###############################################################################
###############################################################################
# Neighbor sweep for classification: fit one KNN model for each neighbor
# count from 1 to 50 and record its accuracy on the training and test sets.
training_accuracy = []
test_accuracy = []
neighbors_settings = range(1, 51)

for n_neighbors in neighbors_settings:
    # build and fit the model for this neighbor count
    clf = KNeighborsClassifier(n_neighbors = n_neighbors)
    clf.fit(X_train, y_train.values.ravel())

    # accuracy on the data the model was fitted on
    train_score = clf.score(X_train, y_train)
    training_accuracy.append(train_score)

    # accuracy on the held-out data
    held_out_score = clf.score(X_test, y_test)
    test_accuracy.append(held_out_score)
# Plot training and test accuracy against the number of neighbors.
fig, ax = plt.subplots(figsize=(12, 9))

accuracy_curves = ((training_accuracy, "training accuracy"),
                   (test_accuracy, "test accuracy"))
for accuracy_values, curve_label in accuracy_curves:
    ax.plot(neighbors_settings, accuracy_values, label = curve_label)

ax.set_ylabel("Accuracy")
ax.set_xlabel("n_neighbors")
ax.legend()
plt.show()
# exploring the test accuracy recorded for each neighbor count
print(test_accuracy)

# Printing the neighbor count with the highest test accuracy
# (+1 because the sweep starts at 1 neighbor while list positions start at 0)
print(test_accuracy.index(max(test_accuracy)) + 1)

# It looks like 4 neighbors is the most accurate
knn_clf = KNeighborsClassifier(n_neighbors = 4)

# Fitting the model based on the training data.
# The model is fitted once: a second, identical fit only repeated the work.
# `.values.ravel()` hands the target over as a flat 1-D array.
knn_clf_fit = knn_clf.fit(X_train,
                          y_train.values.ravel())
# Let's compare the testing score to the training score.
# Built-in round() works whether `score` returns a NumPy scalar or a plain
# Python float (which has no `.round()` method).
print('Training Score', round(knn_clf_fit.score(X_train, y_train), 4))
print('Testing Score:', round(knn_clf_fit.score(X_test, y_test), 4))
#Training Score 0.8538
#Testing Score: 0.8103

# Generating class predictions and class probabilities for the test set
# with the chosen KNN model
knn_clf_pred = knn_clf_fit.predict(X_test)
knn_clf_pred_probabilities = knn_clf_fit.predict_proba(X_test)
###############################################################################
# Cross Validation with k-folds for KNN (best score)
###############################################################################
# Cross Validating the knn model with three folds
cv_knn_3 = cross_val_score(knn_clf,
                           char_pred_data,
                           char_pred_target,
                           cv = 3)

# Summary statistics use the score array's own methods; the `pd.np` alias
# used previously is not available in newer pandas.
cv_knn_3_mean = round(cv_knn_3.mean(), 3)

print(cv_knn_3)
print(cv_knn_3_mean)
print('\nAverage: ',
      cv_knn_3_mean,
      '\nMinimum: ',
      round(cv_knn_3.min(), 3),
      '\nMaximum: ',
      round(cv_knn_3.max(), 3))
#Average: 0.729
#Minimum: 0.682
#Maximum: 0.763
# Creating a confusion matrix for KNN (computed once, reused for the plot)
cm = confusion_matrix(y_true = y_test,
                      y_pred = knn_clf_pred)
print(cm)

# Providing labels for the confusion matrix axes
labels = ['Dead',
          'Alive']

# Creating Heatmap.
# fmt='d' prints the cell counts as whole numbers; the default annotation
# format switches to scientific notation for counts of 100 or more.
sns.heatmap(cm,
            annot = True,
            fmt = 'd',
            xticklabels = labels,
            yticklabels = labels,
            cmap = 'Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion matrix of the KNN classifier')
plt.show()
###############################################################################
# ROC Curve
###############################################################################
# ROC curve and AUC for the chosen KNN model.
# `knn_clf_fit` (the 4-neighbor model) is used here; `clf` is merely the last
# model left over from the neighbor sweep loop, not the model being reported.
y_pred_proba = knn_clf_fit.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)

# Built-in round() works for NumPy scalars and plain Python floats alike.
auc = round(metrics.roc_auc_score(y_test, y_pred_proba), 3)

plt.plot(fpr, tpr, label="KNearestNeighbor, auc="+str(auc))
plt.legend(loc=4)
plt.show()
# NOTE(review): the previously recorded "AUC Score : 0.7704" was presumably
# produced with the sweep's last model (`clf`); re-run to refresh the value.
###############################################################################
###############################################################################
# Let's compare model with the LogisticRegression
###############################################################################
###############################################################################
# Initiating logistic regression base
logreg = LogisticRegression(C = 1)

# Fitting the model on the training dataset
logreg_fit = logreg.fit(X_train,
                        y_train)

# Running Predictions on the test set
logreg_pred = logreg_fit.predict(X_test)

# Let's compare the testing score to the training score.
# Built-in round() works whether `score` returns a NumPy scalar or a plain
# Python float (which has no `.round()` method).
print('Training Score', round(logreg_fit.score(X_train, y_train), 4))
print('Testing Score:', round(logreg_fit.score(X_test, y_test), 4))
#Training Score 0.8218
#Testing Score: 0.7538
# Creating a confusion matrix for logreg (computed once, reused for the plot)
cm = confusion_matrix(y_true = y_test,
                      y_pred = logreg_pred)
print(cm)

# Providing labels for the confusion matrix axes
labels = ['Dead',
          'Alive']

# Creating Heatmap.
# fmt='d' prints the cell counts as whole numbers; the default annotation
# format switches to scientific notation for counts of 100 or more.
sns.heatmap(cm,
            annot = True,
            fmt = 'd',
            xticklabels = labels,
            yticklabels = labels,
            cmap = 'Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion matrix of the classifier')
plt.show()
###############################################################################
# Creating a classification report
###############################################################################
# Print the logistic regression classification report twice: first with the
# default class labels, then with the readable names held in `labels`.
for report_options in ({}, {'target_names': labels}):
    print(classification_report(y_true = y_test,
                                y_pred = logreg_pred,
                                **report_options))
# Export the test-set outcomes next to the KNN predictions for submission.
prediction_columns = {'Actual' : y_test,
                      'KNN_Predicted': knn_clf_pred}
ishwor_model_prediction = pd.DataFrame(prediction_columns)
ishwor_model_prediction.to_excel("Ishwor_Model_Prediction_KNN.xlsx")

# Export the working dataframe (with the missing-value flags and imputed
# columns added by this script) to the local drive.
char_pred.to_excel("ishwor_got_file_after_code.xlsx")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment