Skip to content

Instantly share code, notes, and snippets.

@jmsword
Created February 11, 2017 20:04
Show Gist options
  • Save jmsword/61b669321a8f17845ce2a1965d8554af to your computer and use it in GitHub Desktop.
Save jmsword/61b669321a8f17845ce2a1965d8554af to your computer and use it in GitHub Desktop.
Random Forest
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as skm
import pylab as pl
# Read in the column names for the dataset. Every line of 'features.txt' is
# split on runs of whitespace; sep=r'\s+' is the supported spelling of the
# deprecated delim_whitespace=True and parses the file the same way.
feat = pd.read_csv('features.txt', sep=r'\s+', header=None, index_col=False)
# Name the two parsed fields; the dataset column names live in 'column 2'.
feat.columns = ['column 1', 'column 2']
# Clean up the column names. The substitutions are applied in this exact
# order ('BodyBody' before 'Body', as in the original script). regex=False
# makes every pattern a literal string, so '(' and ')' can never be read as
# regular-expression syntax regardless of the pandas default.
for old, new in (
    ('-', ''),
    ('(', ''),
    (')', ''),
    (',', ''),
    ('BodyBody', ''),
    ('Body', ''),
    ('Mag', ''),
    ('mean', 'Mean'),
    ('std', 'STD'),
):
    feat['column 2'] = feat['column 2'].str.replace(old, new, regex=False)
# Read in the raw data.
# X_train is whitespace-delimited; sep=r'\s+' replaces the deprecated
# delim_whitespace=True with identical parsing. 'y_train' and
# 'subject_train' are read with the default separator and each yields a
# single column (per the original author's note they are plain ASCII files).
X_train = pd.read_csv('train/X_train.txt', sep=r'\s+', header=None, index_col=False)
y_train = pd.read_csv('train/y_train.txt', header=None, index_col=False)
subjects = pd.read_csv('train/subject_train.txt', header=None, index_col=False)
# Isolate the cleaned column names from the 'feat' DataFrame
features = feat['column 2']
# Assign column names to each raw data file
X_train.columns = features
y_train.columns = ['Activity']
subjects.columns = ['Subject']
# Cleaning can make several feature names identical; keep only the first
# column for each name so that every remaining column label is unique.
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
# Join the three frames row-by-row on their index. Resulting column order:
# 'Activity', the feature columns, then 'Subject'.
data = pd.merge(y_train, X_train, left_index=True, right_index=True)
data = pd.merge(data, subjects, left_index=True, right_index=True)
# Encode 'Activity' as integer category codes
data['Activity'] = data['Activity'].astype('category').cat.codes
# Partition the rows by subject id:
#   training   -> Subject >= 27
#   testing    -> Subject <= 6
#   validation -> 21 <= Subject < 27
train = data[data['Subject'] >= 27]
test = data[data['Subject'] <= 6]
validation = data[(data['Subject'] >= 21) & (data['Subject'] < 27)]
# Fit a random forest to the training set
train_target = train['Activity']
# Model inputs are every column except the label and the subject id.
# The original positional slice (.ix[:, 1:-2]) relied on the removed .ix
# indexer and also discarded the last feature column; dropping the two
# non-feature columns by name keeps all features and is order-independent.
train_data = train.drop(columns=['Activity', 'Subject'])
rfc = RandomForestClassifier(n_estimators=50, oob_score=True)
rfc.fit(train_data, train_target)
print('Out-of-bag score: ', rfc.oob_score_)
# Most important features, highest importance first
importances = rfc.feature_importances_
indices = np.argsort(importances)[::-1]
print('Top 10 Features:')
# indices[:10] is simply shorter when fewer than 10 features exist
for rank, idx in enumerate(indices[:10], start=1):
    # Report the column name alongside its positional index
    print("%d. feature %d: %s (%f)" % (rank, idx, train_data.columns[idx], importances[idx]))
# Define the validation set and predict on it
val_target = validation['Activity']
# Same feature columns as the training data
val_data = validation.drop(columns=['Activity', 'Subject'])
val_pred = rfc.predict(val_data)
# Define the test set and predict on it
test_target = test['Activity']
# Same feature columns as the training data
test_data = test.drop(columns=['Activity', 'Subject'])
test_pred = rfc.predict(test_data)
# Print accuracy scores, computed from the predictions made above so the
# model is not asked to predict the same rows twice
print("Validation set Mean Accuracy Score: %f" % (skm.accuracy_score(val_target, val_pred)))
print("Test set Mean Accuracy Score: %f" % (skm.accuracy_score(test_target, test_pred)))
# Confusion matrix for the test set, shown as a colour-coded matrix plot
test_cm = skm.confusion_matrix(test_target, test_pred)
pl.matshow(test_cm)
pl.title('Test data confusion matrix')
pl.colorbar()
pl.show()
# Print accuracy, precision, recall, and f1 scores for the test set
# (precision/recall/f1 use average='weighted' across the activity classes)
print("Accuracy = %f" % (skm.accuracy_score(test_target, test_pred)))
print("Precision = %f" % (skm.precision_score(test_target, test_pred, average='weighted')))
print("Recall = %f" % (skm.recall_score(test_target, test_pred, average='weighted')))
print("F1 score = %f" % (skm.f1_score(test_target, test_pred, average='weighted')))
###### Note: I had to work off of another student's project to figure this assignment out fully.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment