# --- GitHub gist page header (scraping artifact, not part of the script) ---
# "Create a gist now — Instantly share code, notes, and snippets.
#  What would you like to do?"
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
# set input path (path to LSA or LDA results)
ipath = '/home/ubuntu/results/lsa/results.csv'
# path to the UDS dataset
udsfile = '/home/ubuntu/uds.csv'
# set model parameters
max_features = 0.333 # choices: 0.333, 'sqrt', 'auto'
min_samples_split = 5 # choices: 2, 5
n_repeats = 10000    # passed to the ensembles as n_estimators
learning_rate = 1.0 # used in AdaBoost only
# make training set: UDS scores for year 1992 only, keyed by doc id
# (doc id = country/entity code concatenated with the year, e.g. 'USA1992')
full_uds = pd.read_csv(udsfile, usecols = [0, 1, 3])
refset = full_uds[full_uds['year'] > 1991]
# column 0 = entity code, column 1 = year, column 2 = UDS score
refset = {refset.iat[row, 0] + str(refset.iat[row, 1]): refset.iat[row, 2]
          for row in range(len(refset)) if refset.iat[row, 1] == 1992}
# list() is required under Python 3, where dict.items() returns a view
refset = pd.DataFrame(list(refset.items()), columns = ['doc', 'UDS'])
refset.set_index('doc', inplace = True)
# build full dataset: merge document features with the 1992 UDS scores
data = pd.read_csv(ipath, index_col = [0])
data = pd.merge(data, refset, left_index = True, right_index = True, how = 'outer')
# feature columns = everything except the merged-in 'UDS' score column
feature_cols = data.columns.drop('UDS')
# make X, y, X_new
# take y from the same filtered frame as X so their row order is guaranteed
# to match (np.ravel(refset) was not aligned with the merged index order)
labeled = data[data['UDS'].notnull()]
X = labeled[feature_cols].fillna(0)
y = np.ravel(labeled['UDS'])
# rows with no 1992 score are the out-of-sample documents to predict
X_new = data[data['UDS'].isnull()][feature_cols].fillna(0)
# make validation set: all post-1992 UDS scores, keyed like refset ('doc')
valset = full_uds[full_uds['year'] > 1992]
valset = {valset.iat[row, 0] + str(valset.iat[row, 1]): valset.iat[row, 2]
          for row in range(len(valset))}
# list() is required under Python 3, where dict.items() returns a view
valset = pd.DataFrame(list(valset.items()), columns = ['doc', 'UDS'])
valset.set_index('doc', inplace = True)
# define framework function
def predict(data, X, y, X_new, valset, max_features, min_samples_split,
            n_repeats, learning_rate = 1.0):
    """Fit four tree-based regressors and report validation correlations.

    Each model is fit on (X, y), used to predict scores ('ADS') for X_new,
    and the Pearson correlation between those predictions and the held-out
    validation scores ('UDS') is printed (inner-joined on the doc index).

    Parameters
    ----------
    data : unused; kept for call-site compatibility
    X, y : training features and target scores (row-aligned)
    X_new : features of the documents to score (indexed by doc id)
    valset : DataFrame with a 'UDS' column, indexed by doc id
    max_features, min_samples_split : tree hyper-parameters
    n_repeats : number of estimators for the ensemble models
    learning_rate : AdaBoost learning rate (was read from a global;
        now an explicit parameter with the same default)
    """
    ## set models — all trees share the same splitting hyper-parameters
    tree_kwargs = dict(max_features = max_features,
                       min_samples_split = min_samples_split)
    models = [
        ('decision_tree', DecisionTreeRegressor(**tree_kwargs)),
        ('random_forest', RandomForestRegressor(n_estimators = n_repeats,
                                                **tree_kwargs)),
        ('extra_trees', ExtraTreesRegressor(n_estimators = n_repeats,
                                            **tree_kwargs)),
        ('ADAboost', AdaBoostRegressor(DecisionTreeRegressor(**tree_kwargs),
                                       n_estimators = n_repeats,
                                       learning_rate = learning_rate)),
    ]
    ## run each model
    for model_name, model in models:
        # estimate and predict
        model.fit(X, y)
        # index predictions like X_new so they join valset by doc id;
        # avoids mutating the caller's X_new (old code added/deleted 'ADS')
        preds = pd.Series(model.predict(X_new), index = X_new.index,
                          name = 'ADS')
        # check performance against the validation scores
        contrast = pd.merge(preds.to_frame(),
                            valset,
                            left_index = True,
                            right_index = True,
                            how = 'inner')
        print('')
        print('RESULTS W/ ', model_name)
        print(contrast.corr())
# run the full model comparison with the module-level configuration
predict(data = data, X = X, y = y, X_new = X_new, valset = valset,
        max_features = max_features, min_samples_split = min_samples_split,
        n_repeats = n_repeats)
# --- GitHub gist page footer (scraping artifact, not part of the script) ---