Created
July 17, 2019 21:01
-
-
Save ishwor2048/ead8ad6bfd4bd2767a452b449d3b110c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Importing required packages | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from sklearn.model_selection import train_test_split # train/test split | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.model_selection import cross_val_score | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.model_selection import GridSearchCV | |
from sklearn.metrics import confusion_matrix | |
from sklearn.ensemble import GradientBoostingClassifier | |
from sklearn.metrics import classification_report | |
from sklearn.linear_model import LogisticRegression | |
from sklearn import metrics | |
from sklearn.metrics import roc_auc_score | |
# Load the raw character data into the dataframe "char_pred".
char_pred = pd.read_excel("GOT_character_predictions.xlsx")

###############################################################################
# Exploring the Dataset one by one
###############################################################################
# Results are printed explicitly: when this file runs as a script, a bare
# expression such as `char_pred.head()` is evaluated and then discarded.
char_pred.info()             # dtypes and non-null counts (prints on its own)
print(char_pred.head())      # first five rows
print(char_pred.columns)     # column names
print(char_pred.describe())  # count, mean, std, min, quartiles and max
print(char_pred.shape)       # (rows, columns)

# Missing value research
# Count missing values per column. The check has to run on the dataframe
# itself; `char_pred.columns.isnull()` only looks at the column labels and
# says nothing about the data stored in those columns.
print(char_pred.isnull().sum())
# Flag missing values: for every column that contains at least one missing
# entry, add a companion column "m_<name>" holding 1 where the value was
# missing and 0 where it was present.
null_mask = char_pred.isnull()
columns_with_gaps = [col for col in char_pred.columns if null_mask[col].any()]
for col in columns_with_gaps:
    char_pred['m_' + col] = null_mask[col].astype(int)
###############################################################################
# Data Exploration
###############################################################################
# Correlation Analysis
# Correlation is only computed on numeric (and boolean) columns. The frame
# also holds text columns (culture, house, ...), and recent pandas versions
# raise instead of silently skipping them, so the numeric subset is selected
# explicitly. This works on old and new pandas alike.
numeric_columns = char_pred.select_dtypes(include=['number', 'bool'])
df_corr = numeric_columns.corr().round(2)
print(df_corr)

# Rank every numeric feature by its correlation with the target. The result
# is printed because a bare expression is discarded when run as a script.
print(df_corr.loc['isAlive'].sort_values(ascending=False))
###############################################################################
# Filling out dummies in each of the variables
###############################################################################
# Every categorical column gets the same treatment: missing entries become the
# literal category 'unknown', then the column is one-hot encoded with the
# first level dropped.
categorical_columns = ['culture', 'house', 'mother', 'father', 'heir', 'spouse']

encoded = {}
for cat_name in categorical_columns:
    char_pred[cat_name] = char_pred[cat_name].fillna('unknown')
    encoded[cat_name] = pd.get_dummies(char_pred[cat_name], drop_first=True)

# Expose each encoded frame under its own module-level name.
culture_dummies = encoded['culture']
house_dummies = encoded['house']
mother_dummies = encoded['mother']
father_dummies = encoded['father']
heir_dummies = encoded['heir']
spouse_dummies = encoded['spouse']
# Time to work for age
# Negative ages are clamped to 0. A single `.loc` assignment is used instead
# of chained indexing (`df['age'][mask] = 0`), which may write to a temporary
# copy and leave the dataframe unchanged.
char_pred.loc[char_pred['age'] < 0, 'age'] = 0
# Remaining gaps are filled with the column mean (NaN values are skipped when
# the mean is computed). `Series.mean()` replaces the removed `pd.np` alias.
char_pred['age'] = char_pred['age'].fillna(char_pred['age'].mean())

# Working on Date of Birth: same clamp-then-mean-impute treatment.
char_pred.loc[char_pred['dateOfBirth'] < 0, 'dateOfBirth'] = 0
char_pred['dateOfBirth'] = char_pred['dateOfBirth'].fillna(
    char_pred['dateOfBirth'].mean())

# Now checking if there are missing values left, column by column. The check
# runs on the data; `char_pred.columns.isnull()` would only inspect the
# column labels.
print(char_pred.isnull().any())
###############################################################################
# Train Test Split : Building Base Model
###############################################################################
# Explanatory variables taken straight from the dataframe: the original
# numeric columns plus the "m_" missing-value indicator columns.
base_feature_names = [
    'male',
    'book1_A_Game_Of_Thrones',
    'book2_A_Clash_Of_Kings',
    'book3_A_Storm_Of_Swords',
    'book4_A_Feast_For_Crows',
    'book5_A_Dance_with_Dragons',
    'isMarried',
    'isNoble',
    'numDeadRelations',
    'popularity',
    'm_title',
    'm_culture',
    'm_dateOfBirth',
    'm_mother',
    'm_father',
    'm_heir',
    'm_spouse',
    'm_isAliveMother',
    'm_isAliveFather',
    'm_isAliveHeir',
    'm_isAliveSpouse',
    'm_age',
]
char_pred_data_1 = char_pred.loc[:, base_feature_names]

# Full design matrix: base features side by side with the house and culture
# dummy columns.
char_pred_data = pd.concat(
    [char_pred_data_1, house_dummies, culture_dummies],
    axis=1,
)

# List every column currently in the dataframe.
for col in char_pred.columns:
    print(col)

# Target variable
char_pred_target = char_pred['isAlive']

# 90/10 train/test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    char_pred_data, char_pred_target,
    test_size=0.10, random_state=508, stratify=char_pred_target,
)
###############################################################################
# Predictive Models
###############################################################################
###############################################################################
###############################################################################
# Random Forest Classifier in scikit-learn
###############################################################################
###############################################################################
# Two forests that differ only in the split criterion.
# Full forest using gini
full_forest_gini = RandomForestClassifier(n_estimators=500,
                                          criterion='gini',
                                          max_depth=None,
                                          min_samples_leaf=15,
                                          bootstrap=True,
                                          warm_start=False,
                                          random_state=508)
# Full forest using entropy
full_forest_entropy = RandomForestClassifier(n_estimators=500,
                                             criterion='entropy',
                                             max_depth=None,
                                             min_samples_leaf=15,
                                             bootstrap=True,
                                             warm_start=False,
                                             random_state=508)

# Fitting the models
full_gini_fit = full_forest_gini.fit(X_train, y_train)
full_entropy_fit = full_forest_entropy.fit(X_train, y_train)

# Checking if predictions are the same for each model.
# The predictions are compared element by element. Comparing only their sums
# could report "equal" for two different prediction vectors, and passing one
# vector as the index of a DataFrame does not compare anything.
gini_test_pred = full_gini_fit.predict(X_test)
entropy_test_pred = full_entropy_fit.predict(X_test)
print('Identical test predictions:',
      bool((gini_test_pred == entropy_test_pred).all()))

# Saving score objects (computed once, then reused for printing)
gini_full_train = full_gini_fit.score(X_train, y_train)
gini_full_test = full_gini_fit.score(X_test, y_test)
entropy_full_train = full_entropy_fit.score(X_train, y_train)
entropy_full_test = full_entropy_fit.score(X_test, y_test)

# Builtin round() works whether score() returns a numpy scalar or a plain
# Python float; the `.round()` method only exists on the former.
# Scoring the gini model
print('Training Score', round(gini_full_train, 4))
print('Testing Score:', round(gini_full_test, 4))
# Scoring the entropy model
print('Training Score', round(entropy_full_train, 4))
print('Testing Score:', round(entropy_full_test, 4))
# Values recorded from an earlier run:
# Training Score 0.7459
# Testing Score 0.7436
####################################
# Parameter tuning with GridSearchCV
####################################
# Creating a hyperparameter grid.
# Plain Python ranges replace `pd.np.arange`: the `pd.np` alias no longer
# exists in current pandas, and the grid only needs integer sequences.
estimator_space = list(range(100, 1350, 250))   # 100, 350, ..., 1100
leaf_space = list(range(1, 150, 15))            # 1, 16, ..., 136
criterion_space = ['gini', 'entropy']
bootstrap_space = [True, False]
# NOTE(review): every grid candidate is fitted once from scratch, so
# warm_start is unlikely to change the result and doubles the number of fits.
# Consider dropping it from the grid.
warm_start_space = [True, False]
param_grid = {'n_estimators': estimator_space,
              'min_samples_leaf': leaf_space,
              'criterion': criterion_space,
              'bootstrap': bootstrap_space,
              'warm_start': warm_start_space}

# Building the model object one more time
full_forest_grid = RandomForestClassifier(max_depth=None,
                                          random_state=508)

# Creating a GridSearchCV object (3-fold cross validation per candidate)
full_forest_cv = GridSearchCV(full_forest_grid, param_grid, cv=3)

# Fit it to the training data
full_forest_cv.fit(X_train, y_train)

# Print the optimal parameters and best score. The labels now name the model
# that was actually tuned (a random forest, not a logistic regression).
print("Tuned Random Forest Parameter:", full_forest_cv.best_params_)
print("Tuned Random Forest Accuracy:", round(full_forest_cv.best_score_, 4))
# Cross validating the RandomForest (gini) model with three folds on the full
# data set.
cv_rf_3 = cross_val_score(full_forest_gini,
                          char_pred_data,
                          char_pred_target,
                          cv=3)
print(cv_rf_3)

# cross_val_score returns a numpy array, so its own mean/min/max methods are
# used instead of the `pd.np` alias, which was removed from pandas.
print(cv_rf_3.mean().round(3))
print('\nAverage: ',
      cv_rf_3.mean().round(3),
      '\nMinimum: ',
      cv_rf_3.min().round(3),
      '\nMaximum: ',
      cv_rf_3.max().round(3))
# Output recorded from an earlier run:
# Average: 0.746
# Minimum: 0.745
# Maximum: 0.746
###############################################################################
###############################################################################
# Gradient Boosted Machines
###############################################################################
###############################################################################
# Building a full model for GBM.
# * `loss` is left at its default, which is the same log-loss objective that
#   used to be spelled 'deviance'; that spelling was removed in recent
#   scikit-learn releases.
# * criterion 'squared_error' is the current name of the former 'mse' option.
#   NOTE: this spelling needs scikit-learn >= 1.0.
gbm_got = GradientBoostingClassifier(learning_rate=1.5,
                                     n_estimators=75,
                                     max_depth=1,
                                     criterion='squared_error',
                                     warm_start=False,
                                     random_state=508)

# Fitting gbm model on training dataset
gbm_got_fit = gbm_got.fit(X_train, y_train)

# Predicting target based on X_test set
gbm_predict = gbm_got_fit.predict(X_test)

# Predicting class probabilities (column 1 is used for the AUC below)
gbm_predict_prob = gbm_got_fit.predict_proba(X_test)

# Saving training and testing scores. Builtin round() is used because it
# accepts both numpy scalars and plain Python floats.
TraiScore_gbm = round(gbm_got_fit.score(X_train, y_train), 3)
TestScore_gbm = round(gbm_got_fit.score(X_test, y_test), 3)

# Training and Testing Scores
print('Training Score', round(gbm_got_fit.score(X_train, y_train), 4))
print('Testing Score:', round(gbm_got_fit.score(X_test, y_test), 4))
# Values recorded from an earlier run:
# Training Score 0.8132
# Testing Score 0.7692

# Cross Validation Score (three folds on the full data set)
CV_gbm = cross_val_score(gbm_got_fit,
                         char_pred_data,
                         char_pred_target,
                         cv=3)
# The array's own methods replace the removed `pd.np` alias.
print('\nAverage: ',
      CV_gbm.mean().round(5),
      '\nMinimum: ',
      CV_gbm.min().round(5),
      '\nMaximum: ',
      CV_gbm.max().round(5))
# Values recorded from an earlier run:
# Average: 0.76926
# Minimum: 0.74074
# Maximum: 0.78737

# Computing AUC score and save it
AUC_gbm = round(roc_auc_score(y_test, gbm_predict_prob[:, 1]), 3)
# 0.741 (recorded from an earlier run)
###############################################################################
###############################################################################
# Developing a Classification Base with KNearestNeighbor
###############################################################################
###############################################################################
# Fit one KNN classifier per neighbourhood size from 1 to 50 and record its
# accuracy on the training and on the test split.
training_accuracy = []
test_accuracy = []
neighbors_settings = range(1, 51)

for n_neighbors in neighbors_settings:
    # fit() returns the estimator itself, so `clf` is the fitted model
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(
        X_train, y_train.values.ravel())
    training_accuracy.append(clf.score(X_train, y_train))
    test_accuracy.append(clf.score(X_test, y_test))

# Plot both accuracy curves against the number of neighbours.
fig, ax = plt.subplots(figsize=(12, 9))
accuracy_curves = (
    (training_accuracy, "training accuracy"),
    (test_accuracy, "test accuracy"),
)
for curve_values, curve_label in accuracy_curves:
    ax.plot(neighbors_settings, curve_values, label=curve_label)
ax.set_ylabel("Accuracy")
ax.set_xlabel("n_neighbors")
ax.legend()
plt.show()

# All recorded test accuracies
print(test_accuracy)

# Neighbour count with the highest test accuracy (list positions start at 0,
# neighbour counts at 1, hence the +1).
best_position = test_accuracy.index(max(test_accuracy))
print(best_position + 1)
# It looks like 4 neighbors is the most accurate
knn_clf = KNeighborsClassifier(n_neighbors=4)

# Fitting the model based on the training data. The model is fitted once; the
# earlier version fitted it twice in a row and the first result was simply
# overwritten. `.values.ravel()` passes the target as a flat 1-D array.
knn_clf_fit = knn_clf.fit(X_train,
                          y_train.values.ravel())

# Let's compare the testing score to the training score.
# Builtin round() accepts both numpy scalars and plain Python floats.
print('Training Score', round(knn_clf_fit.score(X_train, y_train), 4))
print('Testing Score:', round(knn_clf_fit.score(X_test, y_test), 4))
# Values recorded from an earlier run:
# Training Score 0.8538
# Testing Score: 0.8103

# Generating predictions and class probabilities with the selected KNN model
knn_clf_pred = knn_clf_fit.predict(X_test)
knn_clf_pred_probabilities = knn_clf_fit.predict_proba(X_test)
###############################################################################
# Cross Validation with k-folds for KNN (best score)
###############################################################################
# Cross validating the knn model with three folds on the full data set.
cv_knn_3 = cross_val_score(knn_clf,
                           char_pred_data,
                           char_pred_target,
                           cv=3)
print(cv_knn_3)

# cross_val_score returns a numpy array, so its own mean/min/max methods are
# used instead of the `pd.np` alias, which was removed from pandas.
print(cv_knn_3.mean().round(3))
print('\nAverage: ',
      cv_knn_3.mean().round(3),
      '\nMinimum: ',
      cv_knn_3.min().round(3),
      '\nMaximum: ',
      cv_knn_3.max().round(3))
# Values recorded from an earlier run:
# Average: 0.729
# Minimum: 0.682
# Maximum: 0.763
# Creating a confusion matrix for KNN (computed once, then printed and
# plotted).
cm = confusion_matrix(y_true=y_test,
                      y_pred=knn_clf_pred)
print(cm)

# Providing label to the confusion matrix
labels = ['Dead',
          'Alive']

# Creating Heatmap.
# fmt='d' prints the cell counts as plain integers; with the default number
# format, counts of 100 or more are shown in scientific notation (1.4e+02).
sns.heatmap(cm,
            annot=True,
            fmt='d',
            xticklabels=labels,
            yticklabels=labels,
            cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion matrix of the KNN classifier')
plt.show()
###############################################################################
# ROC Curve
###############################################################################
# The curve is drawn for the selected KNN model (`knn_clf_fit`). The earlier
# version used `clf`, which is whatever model the neighbour-search loop fitted
# last (n_neighbors = 50), not the model evaluated everywhere else.
y_pred_proba = knn_clf_fit.predict_proba(X_test)[:, 1]   # second class column
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
# Builtin round() accepts both numpy scalars and plain Python floats.
auc = round(metrics.roc_auc_score(y_test, y_pred_proba), 3)
plt.plot(fpr, tpr, label="KNearestNeighbor, auc=" + str(auc))
plt.legend(loc=4)
plt.show()
# AUC Score : 0.7704 (recorded from an earlier run that used `clf`; re-run to
# refresh)
###############################################################################
###############################################################################
# Let's compare model with the LogisticRegression
###############################################################################
###############################################################################
# Initiating logistic regression base
logreg = LogisticRegression(C=1)

# Fitting the model on the training split
logreg_fit = logreg.fit(X_train,
                        y_train)

# Running predictions on the test split
logreg_pred = logreg_fit.predict(X_test)

# Let's compare the testing score to the training score.
# Builtin round() is used instead of the `.round()` method, which only exists
# when score() returns a numpy scalar and fails on a plain Python float.
print('Training Score', round(logreg_fit.score(X_train, y_train), 4))
print('Testing Score:', round(logreg_fit.score(X_test, y_test), 4))
# Values recorded from an earlier run:
# Training Score 0.8218
# Testing Score: 0.7538
# Creating a confusion matrix for logreg (computed once, then printed and
# plotted).
cm = confusion_matrix(y_true=y_test,
                      y_pred=logreg_pred)
print(cm)

# Providing label to the confusion matrix
labels = ['Dead',
          'Alive']

# Creating Heatmap.
# fmt='d' prints the cell counts as plain integers; with the default number
# format, counts of 100 or more are shown in scientific notation (1.4e+02).
sns.heatmap(cm,
            annot=True,
            fmt='d',
            xticklabels=labels,
            yticklabels=labels,
            cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion matrix of the classifier')
plt.show()
###############################################################################
# Creating a classification report
###############################################################################
# The report for the logistic regression predictions is printed twice: first
# with the default class names, then with the readable labels.
for report_options in ({}, {'target_names': labels}):
    print(classification_report(y_true=y_test,
                                y_pred=logreg_pred,
                                **report_options))
# Export the KNN predictions next to the actual outcomes for submission. The
# frame keeps the row index of the test split.
ishwor_model_prediction = (
    y_test
    .to_frame(name='Actual')
    .assign(KNN_Predicted=knn_clf_pred)
)
ishwor_model_prediction.to_excel("Ishwor_Model_Prediction_KNN.xlsx")

# Export the working dataframe (with imputed values and the added missing
# value flag columns) to the local drive.
char_pred.to_excel("ishwor_got_file_after_code.xlsx")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment