Skip to content

Instantly share code, notes, and snippets.

@thiagomarzagao
Created May 8, 2014 19:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thiagomarzagao/22cb3f26a750c9c7c2d3 to your computer and use it in GitHub Desktop.
Save thiagomarzagao/22cb3f26a750c9c7c2d3 to your computer and use it in GitHub Desktop.
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
# --- configuration -----------------------------------------------------------
# set input path (path to LSA or LDA results: one row per document, one
# column per topic/dimension)
ipath = '/home/ubuntu/results/lsa/results.csv'
# path to the UDS dataset (Unified Democracy Scores; country, year, score)
udsfile = '/home/ubuntu/uds.csv'
# set model parameters (shared by all four tree-based regressors below)
max_features = 0.333 # choices: 0.333, 'sqrt', 'auto'
min_samples_split = 5 # choices: 2, 5
n_repeats = 10000 # passed as n_estimators to the ensemble models
learning_rate = 1.0 # used in AdaBoost only
# --- build training, prediction, and validation sets -------------------------
# UDS file layout assumed from usecols: col 0 = country id, col 1 = year,
# col 3 = UDS score -- TODO confirm against the actual CSV header.
full_uds = pd.read_csv(udsfile, usecols=[0, 1, 3])

# Training reference set: UDS scores for 1992 only. The doc identifier is
# country id concatenated with the year (e.g. 'USA1992'). A dict is used so
# duplicate doc ids collapse (last occurrence wins), as in the original.
ref_rows = full_uds[full_uds['year'] == 1992]
refset = {ref_rows.iat[i, 0] + str(ref_rows.iat[i, 1]): ref_rows.iat[i, 2]
          for i in range(len(ref_rows))}
refset = pd.DataFrame(list(refset.items()), columns=['doc', 'UDS'])
refset.set_index('doc', inplace=True)

# Merge the LSA/LDA document features with the reference scores; an outer
# merge keeps documents that have no UDS score (these become X_new).
data = pd.read_csv(ipath, index_col=[0])
data = pd.merge(data, refset, left_index=True, right_index=True, how='outer')

# X/y: rows with a known UDS score; X_new: rows still to be scored.
# y is taken from the merged frame itself so labels stay aligned with X's
# row order (pulling it from the refset dict risked order mismatch).
feature_cols = data.columns[:-1]  # every column except the appended 'UDS'
labeled = data['UDS'].notnull()
X = data.loc[labeled, feature_cols].fillna(0)
y = data.loc[labeled, 'UDS'].values
X_new = data.loc[~labeled, feature_cols].fillna(0)

# Validation set: UDS scores for every year after 1992.
val_rows = full_uds[full_uds['year'] > 1992]
valset = {val_rows.iat[i, 0] + str(val_rows.iat[i, 1]): val_rows.iat[i, 2]
          for i in range(len(val_rows))}
valset = pd.DataFrame(list(valset.items()), columns=['doc', 'UDS'])
valset.set_index('doc', inplace=True)
# define framework function
def predict(data, X, y, X_new, valset, max_features, min_samples_split, n_repeats):
    """Fit four tree-based regressors and report validation correlations.

    Each model is trained on (X, y), used to score the unlabeled documents
    in X_new, and the predicted scores ('ADS') are correlated against the
    known validation scores ('UDS') for the documents present in both.

    Parameters
    ----------
    data : unused; kept for backward compatibility with existing callers.
    X : DataFrame of training features.
    y : 1-d array of training targets (UDS scores), aligned with X.
    X_new : DataFrame of features for documents to score.
    valset : DataFrame indexed by doc id with a 'UDS' column.
    max_features, min_samples_split : passed to every tree model.
    n_repeats : number of estimators for the ensemble models.

    Notes
    -----
    Reads the module-level ``learning_rate`` for AdaBoost (as the original
    did). Prints a correlation matrix per model; returns nothing.
    """
    # (name, model) pairs -- all share the same tree hyperparameters
    models = [
        ('decision_tree',
         DecisionTreeRegressor(max_features=max_features,
                               min_samples_split=min_samples_split)),
        ('random_forest',
         RandomForestRegressor(n_estimators=n_repeats,
                               max_features=max_features,
                               min_samples_split=min_samples_split)),
        ('extra_trees',
         ExtraTreesRegressor(n_estimators=n_repeats,
                             max_features=max_features,
                             min_samples_split=min_samples_split)),
        ('ADAboost',
         AdaBoostRegressor(DecisionTreeRegressor(
                               max_features=max_features,
                               min_samples_split=min_samples_split),
                           n_estimators=n_repeats,
                           learning_rate=learning_rate)),
    ]
    for model_name, model in models:
        # estimate and predict
        model.fit(X, y)
        # hold predictions in a separate frame instead of mutating the
        # caller's X_new (the original added and then deleted an 'ADS' column)
        scored = pd.DataFrame({'ADS': model.predict(X_new)}, index=X_new.index)
        # check performance: inner merge keeps only docs with a known score
        contrast = pd.merge(scored,
                            valset,
                            left_index=True,
                            right_index=True,
                            how='inner')
        contrast = contrast[['ADS', 'UDS']]
        print('')
        print('RESULTS W/ ', model_name)
        print(contrast.corr())
predict(data, X, y, X_new, valset, max_features, min_samples_split, n_repeats)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment