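"""Grid search of XGBoost regressors for the Big Five personality traits.

Gist by @Benedikt1992, last active March 2, 2018.

For each trait ('ope', 'con', 'ext', 'agr', 'neu') and each feature set
(the trait-specific set, the 'common' set, and their union), the script runs
a GridSearchCV over XGBRegressor hyperparameters twice: once on the
Pearson-selected features and once on the boosting-selected features. Each
best model is refit on the full training split, scored on a held-out test
split, and the fitted search object is pickled to data/xgboost-models/.
"""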

import os
import pickle

import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

from data.selected_features_boosting import selected_features_boosting
from data.selected_features_pearson import selected_features_pearson

TRAITS_TO_TRAIN = ['ope', 'con', 'ext', 'agr', 'neu']
TEST_SIZE = 0.2
TRAITS = set(TRAITS_TO_TRAIN)

data = pd.read_csv("data/training_data.csv", index_col=0)
selected_features = {}


# RMSE metric: sqrt(mean((y_actual - y_predicted)^2))
def rmse(y_actual, y_predicted):
    from sklearn.metrics import mean_squared_error
    from math import sqrt
    return sqrt(mean_squared_error(y_actual, y_predicted))


# r2 metric (coefficient of determination)
def r2(y_actual, y_predicted):
    from sklearn.metrics import r2_score
    return r2_score(y_actual, y_predicted)


def boostwith(X_train, X_test, y_train, y_test, trait, load_from_disk=False):
    """Grid-search an XGBRegressor for one trait/feature-set combination and
    pickle the fitted search object, or load a previously saved one."""
    scoring = {
        'rmse': make_scorer(rmse, greater_is_better=False),
        'r2': 'r2'  # built-in scorer; equivalent to make_scorer(r2)
    }
    # random_state only takes effect when shuffling is enabled
    kfold = KFold(n_splits=3, shuffle=True, random_state=7)
    # LEARN
    if not load_from_disk:
        xgb_model = XGBRegressor()
        clf = GridSearchCV(xgb_model,
                           {'max_depth': [2, 3, 4],
                            'n_estimators': [300, 500],
                            'learning_rate': [0.01, 0.1, 0.001],
                            },
                           n_jobs=-1,
                           cv=kfold,
                           scoring=scoring,
                           refit='rmse',  # refit the best model on the whole training set using the RMSE metric
                           return_train_score=False,  # skip train scores for better performance
                           verbose=2  # print progress
                           )
        clf.fit(X_train, y_train)
        print("Best score: {}".format(clf.best_score_))
        print("Best params: {}".format(clf.best_params_))
        print("\n## Scoring on test set:")
        s = clf.best_estimator_.score(X_test, y_test)  # R^2 of the refit best estimator
        print(" score=", s)
        if not os.path.exists('data/xgboost-models'):
            os.makedirs('data/xgboost-models')
        pickle.dump(clf, open("data/xgboost-models/{}_boosting.model".format(trait), "wb"))
    else:
        clf = pickle.load(open("data/xgboost-models/{}_boosting.model".format(trait), "rb"))
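

# Hypothetical helper (not in the original gist): boostwith() loads a pickled
# search object when load_from_disk=True but then discards it, so a minimal
# sketch for reusing a saved model elsewhere might look like this.
def load_boosting_model(name):
    """Reload a pickled GridSearchCV saved by boostwith(); `name` is the
    trait/feature-set tag, e.g. "neu_union_pearson" (assumed naming)."""
    with open("data/xgboost-models/{}_boosting.model".format(name), "rb") as f:
        return pickle.load(f)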


def prepare_data(training_data, relative_test_size):
    scores = training_data[[*TRAITS]]
    features = training_data.drop(['userid'] + list(TRAITS), axis=1)
    # split into train-validation and test sets
    features_train, features_test, scores_train, scores_test = train_test_split(
        features, scores, test_size=relative_test_size, random_state=7)
    return features_train, features_test, scores_train, scores_test


if __name__ == "__main__":
    data = pd.read_csv("data/training_data.csv", index_col=0)
    relative_test_size = TEST_SIZE
    features_train, features_test, scores_train, scores_test = prepare_data(data, relative_test_size)
    print("Number of training samples", len(features_train))
    print("Number of test samples", len(features_test))
    for trait in TRAITS_TO_TRAIN:
        for feature_set in [trait, 'common', 'union']:
            if feature_set == 'union':
                selected_features_boosting['union'] = set(selected_features_boosting[trait]) | set(selected_features_boosting['common'])
                selected_features_pearson['union'] = set(selected_features_pearson[trait]) | set(selected_features_pearson['common'])

            # train on the Pearson-selected features
            feature_names = selected_features_pearson[feature_set]
            name = trait + "_" + feature_set + "_" + "pearson"
            print("######################")
            print("starting next training with pearson:", trait, feature_set)
            print("######################")
            # select features
            X_train = features_train[[*feature_names]]
            X_test = features_test[[*feature_names]]
            y_train = scores_train[trait].values.flatten()
            y_test = scores_test[trait].values.flatten()
            # actual training
            boostwith(X_train, X_test, y_train, y_test, name)

            # ---------------------------------------------------------------------------------
            # train on the boosting-selected features
            feature_names = selected_features_boosting[feature_set]
            name = trait + "_" + feature_set + "_" + "boosting"
            print("######################")
            print("starting next training with boosting:", trait, feature_set)
            print("######################")
            # select features
            X_train = features_train[[*feature_names]]
            X_test = features_test[[*feature_names]]
            y_train = scores_train[trait].values.flatten()
            y_test = scores_test[trait].values.flatten()
            # actual training
            boostwith(X_train, X_test, y_train, y_test, name)
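

# A minimal follow-up sketch (assumed usage, not part of the original gist):
# each pickled GridSearchCV can be reloaded and its cross-validation table
# inspected. With the multi-metric scoring dict above, sklearn exposes
# mean_test_rmse and mean_test_r2 columns in cv_results_; the rmse column is
# negated because its scorer was built with greater_is_better=False.
#
#   clf = load_boosting_model("neu_union_pearson")
#   print(clf.best_params_)
#   print(pd.DataFrame(clf.cv_results_)[["params", "mean_test_rmse", "mean_test_r2"]])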