Skip to content

Instantly share code, notes, and snippets.

@AviBanerjee13
Created February 14, 2016 17:56
Show Gist options
  • Save AviBanerjee13/1d3a73969d0741d31031 to your computer and use it in GitHub Desktop.
Save AviBanerjee13/1d3a73969d0741d31031 to your computer and use it in GitHub Desktop.
Data Analysis assignment - Random Forest
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics
# Feature Importance
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
"""
Data Engineering and Analysis
"""
# Load the dataset. The file imports pandas under the alias `pd`; the bare
# name `pandas` is never bound, so the reader must be reached through `pd`.
data = pd.read_csv('gapminder.csv')

# Columns used by the analysis: the response variable first, then the
# predictors.
numeric_columns = [
    'polityscore',        # target variable
    'incomeperperson',
    'employrate',
    'femaleemployrate',
    'lifeexpectancy',
    'armedforcesrate',
    'internetuserate',
    'urbanrate',
]

# Convert every analysis column to a numeric dtype. errors='coerce' turns any
# cell that cannot be parsed as a number -- including '' and ' ' -- into NaN,
# so no separate blank-to-NaN replacement pass is required.
for column in numeric_columns:
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Keep only complete rows.
# NOTE(review): dropna() without `subset` discards a row when ANY column is
# missing, not only the columns listed above -- confirm this is intended.
data_clean = data.dropna()
#%%
# Data Management: Polity Score is chosen as the response variable.
# -10 to -5: Autocratic, -5 to 5: Anocratic and 5 to 10: Democratic.
# Here, Anocratic countries are coded as 0 and countries with political
# biases are coded as 1. Hence we have our bivariate response variable.
def RecodePolityScore(row):
    """Collapse a row's polity score into a binary class label.

    Args:
        row: Mapping-like object (for example a DataFrame row) that exposes
            a numeric value under the key 'polityscore'.

    Returns:
        1 when the score is <= -5 or >= 5, 0 when the score lies strictly
        between -5 and 5, and None when neither comparison holds (which is
        the case for NaN).
    """
    score = row['polityscore']
    if score <= -5 or score >= 5:
        return 1
    if -5 < score < 5:
        return 0
    # Only reachable for values that fail every comparison, such as NaN.
    return None
# Replace the raw polity score with its binary recoding, computed row by row
recoded_scores = data_clean.apply(RecodePolityScore, axis=1)
data_clean['polityscore'] = recoded_scores
#%%
"""
Modeling and Prediction
"""
# Split into training and testing sets (40% of the rows are held out)
predictor_columns = [
    'incomeperperson',
    'internetuserate',
    'femaleemployrate',
    'armedforcesrate',
    'employrate',
    'lifeexpectancy',
    'urbanrate',
]
predictors = data_clean[predictor_columns]
targets = data_clean['polityscore']
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors, targets, test_size=.4
)

# Report the dimensions of each partition, one per line
print("Shape")
print("-----")
for partition in (pred_train, pred_test, tar_train, tar_test):
    print(partition.shape)
# Build a 25-tree random forest on the training data
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=25).fit(pred_train, tar_train)

# Score the held-out rows and compare against the true labels
predictions = classifier.predict(pred_test)
confusion = sklearn.metrics.confusion_matrix(tar_test, predictions)
model_accuracy = sklearn.metrics.accuracy_score(tar_test, predictions)

print("Predictions")
print("-----------")
print(confusion)
print("Accuracy of the Model")
print("---------------------")
print(model_accuracy)
# Fit an Extra Trees model (default settings) to the training data
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)

# Display the relative importance the fitted model assigns to each attribute
feature_importances = model.feature_importances_
print(feature_importances)
"""
Run the forest with different numbers of trees and see the effect
of that on the accuracy of the prediction
"""
# Forest sizes to evaluate: 1 through 25 trees. Starting the range at 1 keeps
# the plotted x-axis equal to the n_estimators actually used for each point.
trees = range(1, 26)
accuracy = np.zeros(len(trees))
for idx, n_trees in enumerate(trees):
    # Refit from scratch at each size and score on the same held-out rows
    classifier = RandomForestClassifier(n_estimators=n_trees)
    classifier = classifier.fit(pred_train, tar_train)
    predictions = classifier.predict(pred_test)
    accuracy[idx] = sklearn.metrics.accuracy_score(tar_test, predictions)

# Clear the current axes, then plot test accuracy against forest size
plt.cla()
plt.plot(trees, accuracy)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment