import os
import pandas as pd
import pickle
from keras.layers import BatchNormalization, Dropout, regularizers
from keras.layers import Dense
from keras.layers import Input
from keras.layers.merge import concatenate
from keras.models import Model
from keras.models import Sequential
from keras.utils import plot_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import Adam
#import matplotlib.pyplot as plt
from data.selected_features_pearson import selected_features_pearson
from data.selected_features_boosting import selected_features_boosting
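
# ope/con/ext/agr/neu are the Big Five personality traits:
# openness, conscientiousness, extraversion, agreeableness, neuroticism.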
TRAITS = set(['ope', 'con', 'ext', 'agr', 'neu'])


class ModelBuilder:
    def __init__(self, number_of_features):
        self.number_of_features = number_of_features
    # https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/
    # http://nbviewer.jupyter.org/github/bockjo/deeplearning-and-related/blob/master/Entity_Embedding_Model.ipynb
    # TODO: add the learning rate
    # TODO: add and tune regularization/dropout
    # TODO: the activation and friends could be added as well
    # Do not tune the batch size; choose it as large as possible (RAM usage grows with it).
    def build_keras_model(self, activ="relu", init="uniform", loss='mean_squared_error', learning_rate=0.01, beta_1=0.9, beta_2=0.999, p=0.2, reg_lambda=0., **kwargs):
        list_of_inputs = []
        inps = Input(shape=(self.number_of_features,), name="contin")
        dim = self.number_of_features * 10
        x = Dense(dim, input_dim=self.number_of_features, activation=activ, kernel_initializer=init,
                  kernel_regularizer=regularizers.l2(reg_lambda), name="contin_d")(inps)
        x = BatchNormalization()(x)
        list_of_inputs.append(inps)
        # Build dense network on top of the feature encoder
        x = Dropout(p / 10)(x)
        x = Dense(2 ** 10, activation=activ, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg_lambda))(x)
        x = Dense(2 ** 9, activation=activ, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg_lambda))(x)
        x = Dense(2 ** 8, activation=activ, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg_lambda))(x)
        x = Dense(2 ** 7, activation=activ, kernel_initializer=init, kernel_regularizer=regularizers.l2(reg_lambda))(x)
        x = Dropout(p)(x)
        out = Dense(1, activation=activ)(x)
        current_model = Model(list_of_inputs, out)
        optimizer = Adam(lr=learning_rate, beta_1=beta_1, beta_2=beta_2)
        current_model.compile(loss=loss, optimizer=optimizer)
        return current_model
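
# Minimal usage sketch (the feature count of 50 is a made-up example):
#   builder = ModelBuilder(50)
#   model = builder.build_keras_model(learning_rate=0.001, p=0.2, reg_lambda=0.01)
#   model.summary()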


def save(model, grid, history, model_name, model_png=True):
    if not os.path.exists('data/NN'):
        os.makedirs('data/NN')
    # Save model architecture picture
    if model_png:
        plot_model(model, to_file='data/NN/' + model_name + '.png', show_shapes=True)
    # Save model structure
    model_json = model.to_json()
    with open('data/NN/' + model_name + ".json", "w") as json_file:
        json_file.write(model_json)
    # Save model weights
    model.save_weights('data/NN/' + model_name + "_weights.h5")
    grid_results = [grid.cv_results_, grid.best_score_, grid.best_params_]
    pickle.dump(grid_results, open("data/NN/{}_grid_results.pickle".format(model_name), "wb"))
    # https://keras.io/getting-started/faq/#how-can-i-save-a-keras-model
    model.save("data/NN/{}_grid.keras".format(model_name))
    pickle.dump(history.history, open("data/NN/{}_history.pickle".format(model_name), "wb"))
    print(model_name + " has been saved to disk.")
    return
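
# Loading the saved artifacts back (sketch; <model_name> stands for the name used above):
#   from keras.models import load_model
#   model = load_model("data/NN/<model_name>_grid.keras")
#   with open("data/NN/<model_name>_grid_results.pickle", "rb") as f:
#       cv_results, best_score, best_params = pickle.load(f)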


# RMSE metric
def rmse(y_actual, y_predicted):
    from sklearn.metrics import mean_squared_error
    from math import sqrt
    return sqrt(mean_squared_error(y_actual, y_predicted))


# r2 metric
def r2(y_actual, y_predicted):
    from sklearn.metrics import r2_score
    return r2_score(y_actual, y_predicted)


def train(trait, number_of_features, x, y, x_test, y_test):
    scoring = {
        'rmse': make_scorer(rmse, greater_is_better=False),
        'r2': 'r2'  # make_scorer(r2)
    }
    # 3 epochs x 2 learning rates x 3 dropout values x 3 lambdas = 54 candidate configurations
    grid_params = [
        {
            'epochs': [100, 200, 500],
            'batch_size': [25],
            'activ': ['relu'],
            'loss': ['mean_squared_error'],
            'learning_rate': [0.001, 0.0001],
            'beta_1': [0.9],
            'beta_2': [0.999],
            'p': [0., 0.2, 0.5],
            'reg_lambda': [0., 0.01, 0.001]
        }
    ]
    builder = ModelBuilder(number_of_features)
    sklearn_mock = KerasRegressor(build_fn=builder.build_keras_model, verbose=0)
    # random_state only takes effect when shuffle=True (newer sklearn versions raise otherwise)
    kfold = KFold(n_splits=3, shuffle=True, random_state=7)
    grid = GridSearchCV(sklearn_mock,
                        cv=kfold,
                        n_jobs=-1,
                        param_grid=grid_params,
                        scoring=scoring,
                        refit='rmse',  # refit the best-scoring model on the whole training data using the rmse metric
                        return_train_score=False,  # for better performance
                        verbose=2  # print progress
                        )
    grid.fit(x, y)
    model = builder.build_keras_model(**grid.best_params_)
    history = model.fit(x, y, verbose=0, validation_split=0.2, epochs=grid.best_params_["epochs"], batch_size=grid.best_params_["batch_size"])
    save(model, grid, history, trait)
    print("\n## Best score:", grid.best_score_)
    print("## Best parameters:", grid.best_params_)
    print("\n## Scoring on test set:")
    s = grid.best_estimator_.score(x_test, y_test)
    print(" score=", s)


def prepare_data(training_data, relative_test_size):
    scores = training_data[[*TRAITS]]
    features = training_data.drop(['userid'] + list(TRAITS), axis=1)
    # split into train-validation and test sets
    features_train, features_test, scores_train, scores_test = train_test_split(features, scores, test_size=relative_test_size, random_state=7)
    return features_train, features_test, scores_train, scores_test
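
# training_data.csv is expected to provide a 'userid' column, one score column per
# trait in TRAITS, and the remaining columns as features.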


if __name__ == "__main__":
    data = pd.read_csv("data/training_data.csv", index_col=0)
    relative_test_size = 0.2
    features_train, features_test, scores_train, scores_test = prepare_data(data, relative_test_size)
    print("Number of training samples", len(features_train))
    print("Number of test samples", len(features_test))
    for trait in TRAITS:
        for feature_set in [trait, 'common', 'union']:
            if feature_set == 'union':
                selected_features_boosting['union'] = set(selected_features_boosting[trait]) | set(selected_features_boosting['common'])
                selected_features_pearson['union'] = set(selected_features_pearson[trait]) | set(selected_features_pearson['common'])
            # trait = 'neu'
            feature_names = selected_features_pearson[feature_set]
            name = trait + "_" + feature_set + "_" + "pearson"
            if not os.path.exists("data/NN/{}_grid.keras".format(name)):
                print("######################")
                print("starting next training with pearson:", trait, feature_set)
                print("######################")
                # select features
                X_train = features_train[[*feature_names]]
                X_test = features_test[[*feature_names]]
                y_train = scores_train[trait].values.flatten()
                y_test = scores_test[trait].values.flatten()
                # actual training
                train(name, len(feature_names), X_train, y_train, X_test, y_test)
            # ---------------------------------------------------------------------------------
            # trait = 'neu'
            feature_names = selected_features_boosting[feature_set]
            name = trait + "_" + feature_set + "_" + "boosting"
            if not os.path.exists("data/NN/{}_grid.keras".format(name)):
                print("######################")
                print("starting next training with boosting:", trait, feature_set)
                print("######################")
                # select features
                X_train = features_train[[*feature_names]]
                X_test = features_test[[*feature_names]]
                y_train = scores_train[trait].values.flatten()
                y_test = scores_test[trait].values.flatten()
                # actual training
                train(name, len(feature_names), X_train, y_train, X_test, y_test)

    # print(history)
    # print(history.history['loss'])
    # # summarize history for loss
    # plt.plot(history.history['loss'])
    # plt.plot(history.history['val_loss'])
    # plt.title('model loss')
    # plt.ylabel('loss')
    # plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    # plt.show()