Created May 8, 2014 19:57
Save thiagomarzagao/22cb3f26a750c9c7c2d3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import numpy as np | |
import pandas as pd | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.ensemble import ExtraTreesRegressor | |
from sklearn.tree import DecisionTreeRegressor | |
from sklearn.ensemble import AdaBoostRegressor | |
# ----- configuration -----
# input path (path to LSA or LDA results)
ipath = '/home/ubuntu/results/lsa/results.csv'
# path to the UDS dataset
udsfile = '/home/ubuntu/uds.csv'
# model hyperparameters
max_features = 0.333     # choices: 0.333, 'sqrt', 'auto'
min_samples_split = 5    # choices: 2, 5
n_repeats = 10000        # number of estimators for the ensemble models
learning_rate = 1.0      # used in AdaBoost only

# ----- training set -----
# load columns 0, 1, 3 of the UDS file (country id, year, UDS score --
# assumed from the usage below; confirm against the CSV header)
full_uds = pd.read_csv(udsfile, usecols = [0, 1, 3])
refset = full_uds[full_uds['year'] > 1991]
# key = country id + year (e.g. 'XYZ1992'), value = UDS score;
# training labels come from year 1992 only
refset = {refset.iat[row, 0] + str(refset.iat[row, 1]): refset.iat[row, 2]
          for row in range(len(refset)) if refset.iat[row, 1] in [1992]}
refset = pd.DataFrame(refset.items(), columns = ['doc', 'UDS'])
refset.set_index('doc', inplace = True)

# ----- full dataset: LSA/LDA features outer-joined with the UDS labels -----
data = pd.read_csv(ipath, index_col = [0])
data = pd.merge(data, refset, left_index = True, right_index = True, how = 'outer')

# ----- X, y, X_new -----
# feature columns are assumed to be labeled 0 .. k-1, with 'UDS' appended
# last by the merge above
X = data[data['UDS'].notnull()][range(len(data.columns) - 1)].fillna(0)
# BUGFIX: take y from the same merged, filtered frame as X so targets are
# row-aligned with the features. The original np.ravel(refset) used the
# (dict-derived, effectively arbitrary) row order of refset, which need not
# match the row order the outer merge produced for X.
y = np.ravel(data[data['UDS'].notnull()]['UDS'])
X_new = data[data['UDS'].isnull()][range(len(data.columns) - 1)].fillna(0)

# ----- validation set: every year after 1992 -----
valset = full_uds[full_uds['year'] > 1992]
valset = {valset.iat[row, 0] + str(valset.iat[row, 1]): valset.iat[row, 2]
          for row in range(len(valset))}
valset = pd.DataFrame(valset.items(), columns = ['doc', 'UDS'])
valset.set_index('doc', inplace = True)
# define framework function | |
def predict(data, X, y, X_new, valset, max_features, min_samples_split, n_repeats): | |
## set models | |
# decision tree | |
model0 = DecisionTreeRegressor(max_features = max_features, | |
min_samples_split = min_samples_split) | |
# random forest | |
model1 = RandomForestRegressor(n_estimators = n_repeats, | |
max_features = max_features, | |
min_samples_split = min_samples_split) | |
# extreme random forest | |
model2 = ExtraTreesRegressor(n_estimators = n_repeats, | |
max_features = max_features, | |
min_samples_split = min_samples_split) | |
# AdaBoost | |
model3 = AdaBoostRegressor(DecisionTreeRegressor( | |
max_features = max_features, | |
min_samples_split = min_samples_split), | |
n_estimators = n_repeats, | |
learning_rate = learning_rate) | |
## run each model | |
model_names = ['decision_tree', 'random_forest', 'extra_trees', 'ADAboost'] | |
for model, model_name in zip([model0, model1, model2, model3], model_names): | |
# estimate and predict | |
model.fit(X, y) | |
predictions = model.predict(X_new) | |
X_new['ADS'] = predictions | |
# check performance | |
contrast = pd.merge(X_new, | |
valset, | |
left_index = True, | |
right_index = True, | |
how = 'inner') | |
contrast = contrast[['ADS', 'UDS']] | |
print '' | |
print 'RESULTS W/ ', model_name | |
print contrast.corr() | |
del X_new['ADS'] | |
predict(data, X, y, X_new, valset, max_features, min_samples_split, n_repeats) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment