Created
February 14, 2016 17:56
-
-
Save AviBanerjee13/1d3a73969d0741d31031 to your computer and use it in GitHub Desktop.
Data Analysis assignment - Random Forest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import sklearn.metrics
from pandas import Series, DataFrame
from sklearn import datasets
# Feature Importance
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the legacy path
    from sklearn.cross_validation import train_test_split
""" | |
Data Engineering and Analysis | |
""" | |
# Load the dataset.
# NOTE: only the alias `pd` is imported in this file, so the call must go
# through `pd`; `pandas.read_csv` raises NameError.
data = pd.read_csv('gapminder.csv')

# Columns that must be numeric: the response variable first, followed by the
# predictor variables used by the models below.
numeric_columns = [
    'polityscore',
    'incomeperperson',
    'employrate',
    'femaleemployrate',
    'lifeexpectancy',
    'armedforcesrate',
    'internetuserate',
    'urbanrate',
]

# Empty ('' or ' ') cells stand for missing values.  errors='coerce' turns
# every entry that cannot be parsed as a number -- including those blanks --
# into NaN, so no separate astype/replace pass per column is needed.
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Keep only rows without missing values.  The explicit copy makes data_clean
# an independent frame, so assigning to its columns later cannot touch (or
# warn about touching) `data`.
data_clean = data.dropna().copy()
#%% | |
# Data Management: Polity Score is chosen as the response variable.
# -10 to -5: Autocratic, -5 to 5 (exclusive): Anocratic and 5 to 10: Democratic.
# Here, Anocratic countries are coded as 0 and countries with a political
# bias (Autocratic or Democratic) are coded as 1. Hence we have our binary
# response variable.
def RecodePolityScore(row):
    """Collapse a row's polity score into a binary class label.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'polityscore'`` (e.g. a DataFrame row).

    Returns
    -------
    int or None
        ``1`` when the score is <= -5 or >= 5, ``0`` when it lies strictly
        between -5 and 5, and ``None`` when the score is NaN (every
        comparison with NaN is false, so neither class applies).
    """
    score = row['polityscore']
    if score <= -5 or score >= 5:
        return 1
    if -5 < score < 5:
        return 0
    # Only reachable for NaN; made explicit instead of falling off the end.
    return None
# Recode the response variable in place of the raw polity score.
# `assign` builds a new frame rather than writing into data_clean, which
# avoids chained-assignment warnings when data_clean came from a filter.
data_clean = data_clean.assign(
    polityscore=data_clean.apply(RecodePolityScore, axis=1)
)
#%% | |
""" | |
Modeling and Prediction | |
""" | |
# Split into training and testing sets (40% of the rows are held out).
predictors = data_clean[[
    'incomeperperson',
    'internetuserate',
    'femaleemployrate',
    'armedforcesrate',
    'employrate',
    'lifeexpectancy',
    'urbanrate',
]]
targets = data_clean.polityscore
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, targets, test_size=.4
)

print("Shape")
print("-----")
print(pred_train.shape)
print(pred_test.shape)
print(tar_train.shape)
print(tar_test.shape)

# Build a 25-tree random forest on the training data and predict the
# held-out rows.  RandomForestClassifier comes from the top-of-file imports.
classifier = RandomForestClassifier(n_estimators=25)
classifier = classifier.fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)

print("Predictions")
print("-----------")
print(sklearn.metrics.confusion_matrix(tar_test, predictions))
print("Accuracy of the Model")
print("---------------------")
print(sklearn.metrics.accuracy_score(tar_test, predictions))

# Fit an Extra Trees model to the training data.
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)
# Display the relative importance of each attribute, in the same order as
# the columns of `predictors`.
print(model.feature_importances_)
""" | |
Running a different number of trees and see the effect | |
of that on the accuracy of the prediction | |
""" | |
# Fit forests with 1..25 trees and record the test accuracy of each.
# `trees` holds the real tree counts so the x-axis of the plot matches the
# n_estimators used (plotting range(25) would label the 1-tree forest as 0).
trees = range(1, 26)
accuracy = np.zeros(len(trees))
for idx, n_trees in enumerate(trees):
    classifier = RandomForestClassifier(n_estimators=n_trees)
    classifier = classifier.fit(pred_train, tar_train)
    predictions = classifier.predict(pred_test)
    accuracy[idx] = sklearn.metrics.accuracy_score(tar_test, predictions)

# Clear the current axes, then plot accuracy against the number of trees.
plt.cla()
plt.plot(trees, accuracy)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment