Skip to content

Instantly share code, notes, and snippets.

@mick001
Last active August 27, 2015 23:02
Show Gist options
  • Save mick001/e2c94b0f6e20b356f54d to your computer and use it in GitHub Desktop.
Save mick001/e2c94b0f6e20b356f54d to your computer and use it in GitHub Desktop.
Example of how to get started with scikit-learn and use a Random Forest classifier for a classification task
# Minimal scikit-learn walkthrough: load a CSV of categorical features, encode
# them as integers, train a Random Forest classifier on 70% of the rows and
# print its mean accuracy on the held-out 30%.
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

try:
    # Current location of the splitter (scikit-learn >= 0.18).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy location, removed in scikit-learn 0.20; kept only so the script
    # still runs on very old installations.
    from sklearn.cross_validation import train_test_split

# Loading data
data = pd.read_csv('data.csv')

# Encoding categorical features into numerical
buying_map = {'vhigh': 4, 'high': 3, 'med': 2, 'low': 1}
maint_map = {'vhigh': 4, 'high': 3, 'med': 2, 'low': 1}
# Only the non-numeric labels need a mapping for these two columns; the
# remaining values are presumably digit strings that the int cast below
# handles -- TODO confirm against data.csv.
doors_map = {'5more': 6}
persons_map = {'more': 5}
lug_boot_map = {'small': 1, 'med': 2, 'big': 3}
safety_map = {'high': 3, 'med': 2, 'low': 1}
class_map = {'vgood': 4, 'good': 3, 'acc': 2, 'unacc': 1}

# Mapping dictionary: column name -> {label: integer code}. Passing a nested
# dict to DataFrame.replace applies each inner mapping to its own column only.
dict_map = {
    'buying': buying_map,
    'maint': maint_map,
    'doors': doors_map,
    'persons': persons_map,
    'lug_boot': lug_boot_map,
    'safety': safety_map,
    'class': class_map,
}
data = data.replace(dict_map)

# Be sure that the data is of type int (float is fine too).
# astype replaces the deprecated, element-by-element DataFrame.applymap(int).
data = data.astype(int)

# Splitting X and y.
# X is every column except the last one; y is the 'class' column, which is
# assumed to be that last column -- verify against data.csv.
# .values replaces .as_matrix(), which was removed in pandas 1.0.
X = data.iloc[:, :-1].values
y = data['class'].values

# Train test splitting: 30% of the rows are held out, with a fixed seed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Model fitting.
# NOTE(review): no random_state is passed to the forest, so the printed score
# may differ slightly between runs even though the split is fixed.
forest = RandomForestClassifier(n_estimators=500)
forest.fit(X_train, y_train)

# Model score (mean accuracy on the held-out rows)
print(forest.score(X_test, y_test))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment