Created
July 17, 2019 20:59
-
-
Save ishwor2048/2b97b5ab9abc6b71ca1d394037540cf0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Importing Required Libraries and Packages | |
import pandas as pd | |
import seaborn as sns | |
import matplotlib.pyplot as plt | |
import statsmodels.api as sm | |
import os | |
from sklearn.model_selection import train_test_split # train/test split | |
from sklearn.neighbors import KNeighborsRegressor # KNN for Regression | |
import statsmodels.formula.api as sm # regression modeling | |
import sklearn.metrics # more metrics for model performance evaluation | |
from sklearn.model_selection import cross_val_score # k-folds cross validation | |
# Load the birth-weight feature set; the workbook is expected in the current
# working directory.
file = 'birthweight_feature_set.xlsx'
birthweight = pd.read_excel(file)

##############################################################################
# Dataset exploration
##############################################################################
# Every summary below is passed to print(): a bare expression is only echoed
# in an interactive session and is silently discarded when this file is run
# as a script.

# Column names
print(birthweight.columns)

# First rows of the DataFrame
print(birthweight.head())

# Dimensions as (rows, columns)
print(birthweight.shape)

# Per-column dtype and non-null counts (info() writes its own report)
birthweight.info()

# Descriptive statistics, rounded to two decimals
print(birthweight.describe().round(2))

# Rows ordered from highest to lowest birth weight. This is display only:
# sort_values returns a new frame, `birthweight` itself keeps its row order.
print(birthweight.sort_values('bwght', ascending=False))
###############################################################################
# Flagging missing values
###############################################################################

# Number of missing values in each column, before any imputation.
print(birthweight.isnull().sum())

# For each column that has at least one missing value, add an indicator
# column named 'm_<column>' holding 1 where the value was missing and 0
# where it was present. The column list is snapshotted first so the columns
# added inside the loop are not visited.
for col in list(birthweight.columns):
    missing_mask = birthweight[col].isnull()
    if not missing_mask.any():
        continue
    birthweight['m_' + col] = missing_mask.astype(int)
##############################################################################
# Median imputation
##############################################################################

# Replace missing values in 'fage' and 'omaps' with that column's median.
for col in ('fage', 'omaps'):
    fill = birthweight[col].median()
    birthweight[col] = birthweight[col].fillna(fill)

##############################################################################
# Re-check the whole dataset for remaining missing values
##############################################################################

# Prints True if any cell in any column is still missing, otherwise False.
# NOTE(review): only 'fage' and 'omaps' are imputed above, so this prints
# False only if no other column had gaps -- confirm against the data.
print(birthweight.isnull().any().any())
###############################################################################
# Quantile analysis
###############################################################################

# 20th/40th/60th/80th/100th percentile of every column.
quintile_levels = [0.20, 0.40, 0.60, 0.80, 1.00]
birthweight_quantiles = birthweight.quantile(quintile_levels)
print(birthweight_quantiles)

# List the column names, one per line.
for col in birthweight.columns:
    print(col)
##############################################################################
# Visual EDA: histograms of parental age, education and prenatal visits
##############################################################################

# (column, bar colour, x-axis label) for each panel of the 2x2 grid, in
# left-to-right, top-to-bottom order.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases --
# confirm the installed version still provides it.
demographic_panels = (
    ('mage', 'g', 'Mothers Age'),
    ('meduc', 'y', 'Mothers Education'),
    ('npvis', 'orange', 'npvis'),
    ('fage', 'r', 'Fathers Age'),
)

for panel_no, (column, colour, axis_label) in enumerate(demographic_panels,
                                                        start=1):
    plt.subplot(2, 2, panel_no)
    sns.distplot(birthweight[column],
                 bins=10,
                 kde=True,
                 rug=True,
                 color=colour)
    plt.xlabel(axis_label)

# Save before show(): the figure is written to disk, then displayed.
plt.tight_layout()
plt.savefig('Birthweight Data Histograms.png')
plt.show()
##############################################################################
# Visual EDA: histograms of smoking and drinking
##############################################################################

# Two panels on the top row of a 2x2 grid; no rug plot on these.
habit_panels = (
    ('cigs', 'g', 'Mother Smoking Cigarette'),
    ('drink', 'y', 'Mother Drinking Alcohol'),
)

for panel_no, (column, colour, axis_label) in enumerate(habit_panels,
                                                        start=1):
    plt.subplot(2, 2, panel_no)
    sns.distplot(birthweight[column],
                 bins=10,
                 kde=True,
                 color=colour)
    plt.xlabel(axis_label)

plt.tight_layout()
plt.savefig('Birthweight Data Histograms 1 of 2.png')
plt.show()
###############################################################################
# Tuning and flagging outliers
###############################################################################

# 5th/40th/60th/80th/95th percentile of every column, kept for reference
# when choosing the thresholds below.
birthweight_quantiles = birthweight.quantile([0.05, 0.40, 0.60, 0.80, 0.95])

# Flag outliers for mother's age in 'out_mage':
#   -1 where mage <= mage_lo, 1 where mage >= mage_hi, 0 otherwise.
# Boolean masks select rows by their own index labels, so the flags stay
# aligned with the data whatever the DataFrame's index is. The previous
# enumerate()-based loop used positions as labels, which is only correct for
# a default 0..n-1 index.
mage_lo = 18
mage_hi = 50

birthweight['out_mage'] = 0
birthweight.loc[birthweight['mage'] <= mage_lo, 'out_mage'] = -1
birthweight.loc[birthweight['mage'] >= mage_hi, 'out_mage'] = 1
##############################################################################
# Flag outliers for number of prenatal visits in 'out_npvis':
#   -1 where npvis <= npvis_lo, 1 where npvis >= npvis_hi, 0 otherwise.
# This section previously appeared twice, verbatim; it is done once here.
# Boolean masks are used instead of an enumerate() loop so rows are matched
# by index label rather than by position.
##############################################################################
npvis_lo = 5
npvis_hi = 30

birthweight['out_npvis'] = 0
birthweight.loc[birthweight['npvis'] <= npvis_lo, 'out_npvis'] = -1
birthweight.loc[birthweight['npvis'] >= npvis_hi, 'out_npvis'] = 1
###############################################################################
# Correlation analysis
###############################################################################

# Pairwise correlations between all columns, rounded to two decimals.
df_corr = birthweight.corr().round(2)
print(df_corr)

# Correlation of every column with birth weight, strongest positive first.
# Printed explicitly: as a bare expression the ranking was computed and then
# discarded when the file runs as a script.
print(df_corr.loc['bwght'].sort_values(ascending=False))

# In the recorded run below, drink, cigs, mage and fage have the largest
# absolute correlations with bwght (all negative).
#
# Recorded output:
#   bwght      1.00
#   omaps      0.25
#   fmaps      0.25
#   feduc      0.13
#   mblck      0.13
#   fblck      0.12
#   male       0.11
#   meduc      0.09
#   m_npvis    0.06
#   npvis      0.06
#   m_feduc   -0.00
#   moth      -0.02
#   fwhte     -0.04
#   monpre    -0.05
#   foth      -0.08
#   mwhte     -0.11
#   m_meduc   -0.13
#   fage      -0.40
#   mage      -0.46
#   cigs      -0.57
#   drink     -0.74
###############################################################################
# Correlation heatmap
###############################################################################

# Preview the 12-step 'coolwarm' palette in its own figure.
palette_preview = sns.color_palette('coolwarm', 12)
sns.palplot(palette_preview)

# Large square canvas for the annotated heatmap.
fig, ax = plt.subplots(figsize=(15, 15))

# Positions 1 through 18 on both axes of the correlation matrix; position 0
# is left out.
df_corr2 = df_corr.iloc[1:19, 1:19]

heatmap_options = dict(cmap='coolwarm',
                       square=True,
                       annot=True,
                       linecolor='black',
                       linewidths=0.5)
sns.heatmap(df_corr2, **heatmap_options)

plt.savefig('Birthweight Correlation Heatmap.png')
plt.show()
#############################################################################
# Prediction models
#############################################################################

##############################################################################
# Base model: every column except the target as a feature
##############################################################################
y = birthweight['bwght']
X = birthweight.drop(columns=['bwght'])

# Hold out 10% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=508)
###############################################################################
# Multiple linear regression (statsmodels OLS)
###############################################################################
# Imported here under its own alias: the file binds `sm` twice (to
# statsmodels.api and to statsmodels.formula.api), so what `sm` refers to
# depends on import order.
import statsmodels.formula.api as smf

# Predictors entered into the regression of bwght.
ols_predictors = ['mage', 'fage', 'meduc', 'monpre', 'npvis',
                  'cigs', 'drink', 'feduc', 'fmaps', 'mwhte']

# Columns are referenced by name so the formula resolves them from `data`.
# The previous formula spelled each term as birthweight['col'], which is
# evaluated against the global DataFrame instead and produces unwieldy term
# names in the summary.
# Note: the model is fitted on the full dataset, not on the training split.
bwght_mage = smf.ols(formula='bwght ~ ' + ' + '.join(ols_predictors),
                     data=birthweight)

# Fit and report coefficients and goodness-of-fit statistics.
results = bwght_mage.fit()
print(results.summary())
###############################################################################
# KNeighborsRegressor with k = 14
###############################################################################
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Features: everything except the target and the columns listed here.
knn14_excluded = ['bwght', 'feduc', 'omaps', 'fmaps',
                  'male', 'foth', 'out_mage', 'out_npvis']
y = birthweight['bwght']
X = birthweight.drop(columns=knn14_excluded)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=508)

# Instantiate the model with 14 neighbours and fit it on the training data.
knn_reg = KNeighborsRegressor(n_neighbors=14, algorithm='auto')
knn_reg.fit(X_train, y_train)

# Score on the held-out data; the score is directly comparable to R-squared.
y_score = knn_reg.score(X_test, y_test)
print(y_score)
# Recorded output: 0.4900115023210361
##############################################################################
# KNeighborsRegressor with k = 5 on a different feature subset
##############################################################################
# Features: everything except the target and the columns listed here.
knn5_excluded = ['bwght', 'fage', 'fmaps',
                 'mblck', 'moth', 'fwhte', 'fblck', 'foth']
y = birthweight['bwght']
X = birthweight.drop(columns=knn5_excluded)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=508)

# Fit on the training split.
knn_reg = KNeighborsRegressor(n_neighbors=5, algorithm='auto')
knn_reg.fit(X_train, y_train)

# Predict on the held-out rows the model was not fitted on.
y_pred = knn_reg.predict(X_test)

# Show the first 18 predictions.
print(f"""
Test set predictions:
{y_pred[0:18]}
""")

# score() compares predictions with the actual values; the result is
# directly comparable to R-squared.
y_score = knn_reg.score(X_test, y_test)
print(y_score)
# Recorded output: 0.46591928466083826
###############################################################################
# Sweep n_neighbors from 1 to 50 on the current train/test split
###############################################################################
training_accuracy = []
test_accuracy = []
neighbors_settings = range(1, 51)

for n_neighbors in neighbors_settings:
    # Build and fit the model for this k.
    clf = KNeighborsRegressor(n_neighbors=n_neighbors)
    clf.fit(X_train, y_train)

    # Score on the data the model was fitted on ...
    training_accuracy.append(clf.score(X_train, y_train))

    # ... and on the held-out data.
    test_accuracy.append(clf.score(X_test, y_test))

plt.plot(neighbors_settings, training_accuracy, label="training accuracy")
plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()
plt.show()

##############################################################################
# Report the k with the best held-out score. list.index() gives a 0-based
# position while k starts at 1, so the position is mapped back through
# neighbors_settings; printing the raw position under-reported k by one.
best_score = max(test_accuracy)
best_k = neighbors_settings[test_accuracy.index(best_score)]
print("The optimal number of neighbors is",
      best_k,
      "with an optimal score of",
      best_score)
# Recorded output before this fix: "The optimal number of neighbors is 4
# with an optimal score of 0.4879509770000522" -- the 4 was the 0-based
# position, i.e. k = 5.
###############################################################################
# KNN model with k = 5 on a reduced feature set
###############################################################################
# Features: everything except the target and the columns listed here.
knn_optimal_excluded = ['bwght', 'fage', 'feduc', 'omaps', 'fmaps',
                        'male', 'mwhte', 'mblck', 'moth']
y = birthweight['bwght']
X = birthweight.drop(columns=knn_optimal_excluded)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=508)

# Build the model with k = 5 and fit it on the training data.
knn_reg = KNeighborsRegressor(n_neighbors=5, algorithm='auto')
knn_reg_fit = knn_reg.fit(X_train, y_train)

# Held-out score; directly comparable to R-squared.
y_score_knn_optimal = knn_reg.score(X_test, y_test)
print(y_score_knn_optimal)
# Recorded output: 0.43301453370935306

# Predictions for the held-out rows from the fitted model.
knn_reg_optimal_pred = knn_reg_fit.predict(X_test)
##############################################################################
# Linear regression (scikit-learn) on every column except the target
##############################################################################
y = birthweight['bwght']
X = birthweight.drop(columns=['bwght'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=508)

from sklearn.linear_model import LinearRegression

# The model is fitted without an intercept term.
# NOTE(review): fit_intercept=False forces the fit through the origin --
# confirm this is intended.
lr = LinearRegression(fit_intercept=False)
lr_fit = lr.fit(X_train, y_train)

# Predictions for the held-out rows.
lr_pred = lr_fit.predict(X_test)

# Held-out score; directly comparable to R-squared.
y_score_ols_optimal = lr_fit.score(X_test, y_test)
print(y_score_ols_optimal)
# Recorded output: 0.5958968692209615
###############################################################################
# Linear regression on a reduced feature set
###############################################################################
# Features: everything except the target and the columns listed here.
lr_excluded = ['bwght', 'monpre', 'omaps', 'fmaps', 'male',
               'mwhte', 'mblck', 'moth']
y = birthweight['bwght']
X = birthweight.drop(columns=lr_excluded)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=508)

# Fitted without an intercept term, like the all-features model.
lr_mod = LinearRegression(fit_intercept=False)
lr_mod_fit = lr_mod.fit(X_train, y_train)

# Predictions for the held-out rows.
lr_mod_pred = lr_mod_fit.predict(X_test)

# Held-out score of the reduced model.
lr_mod_yscore = lr_mod_fit.score(X_test, y_test)
print(lr_mod_yscore)
# Recorded output: 0.6198159480811648
##############################################################################
# Export actual vs. predicted values to an Excel sheet for submission
##############################################################################
# Pairs each held-out target value with the reduced linear model's prediction.
prediction_columns = {'Actual': y_test,
                      'Linear_Predicted': lr_mod_pred}
ish_Prediction_result = pd.DataFrame(prediction_columns)
ish_Prediction_result.to_excel("ishwor_Model_Prediction.xlsx")
###############################################################################
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment