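"""Grid search of XGBoost regressors for the Big Five personality traits.

Gist by @Benedikt1992, last active March 2, 2018.

For each trait ('ope', 'con', 'ext', 'agr', 'neu') and each feature set
(the trait-specific set, the 'common' set, and their union), the script runs
a GridSearchCV over XGBRegressor hyperparameters twice: once on the
Pearson-selected features and once on the boosting-selected features. Each
best model is refit on the full training split, scored on a held-out test
split, and the fitted search object is pickled to data/xgboost-models/.
"""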

import os
import pickle

import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

from data.selected_features_boosting import selected_features_boosting
from data.selected_features_pearson import selected_features_pearson

TRAITS_TO_TRAIN = ['ope', 'con', 'ext', 'agr', 'neu']
TEST_SIZE = 0.2
TRAITS = set(TRAITS_TO_TRAIN)

data = pd.read_csv("data/training_data.csv", index_col=0)
selected_features = {}


# RMSE metric: sqrt(mean((y_actual - y_predicted)^2))
def rmse(y_actual, y_predicted):
    from sklearn.metrics import mean_squared_error
    from math import sqrt
    return sqrt(mean_squared_error(y_actual, y_predicted))


# r2 metric (coefficient of determination)
def r2(y_actual, y_predicted):
    from sklearn.metrics import r2_score
    return r2_score(y_actual, y_predicted)


def boostwith(X_train, X_test, y_train, y_test, trait, load_from_disk=False):
    """Grid-search an XGBRegressor for one trait/feature-set combination and
    pickle the fitted search object, or load a previously saved one."""
    scoring = {
        'rmse': make_scorer(rmse, greater_is_better=False),
        'r2': 'r2'  # built-in scorer; equivalent to make_scorer(r2)
    }
    # random_state only takes effect when shuffling is enabled
    kfold = KFold(n_splits=3, shuffle=True, random_state=7)
    # LEARN
    if not load_from_disk:
        xgb_model = XGBRegressor()
        clf = GridSearchCV(xgb_model,
                           {'max_depth': [2, 3, 4],
                            'n_estimators': [300, 500],
                            'learning_rate': [0.01, 0.1, 0.001],
                            },
                           n_jobs=-1,
                           cv=kfold,
                           scoring=scoring,
                           refit='rmse',  # refit the best model on the whole training set using the RMSE metric
                           return_train_score=False,  # skip train scores for better performance
                           verbose=2  # print progress
                           )
        clf.fit(X_train, y_train)
        print("Best score: {}".format(clf.best_score_))
        print("Best params: {}".format(clf.best_params_))
        print("\n## Scoring on test set:")
        s = clf.best_estimator_.score(X_test, y_test)  # R^2 of the refit best estimator
        print(" score=", s)
        if not os.path.exists('data/xgboost-models'):
            os.makedirs('data/xgboost-models')
        pickle.dump(clf, open("data/xgboost-models/{}_boosting.model".format(trait), "wb"))
    else:
        clf = pickle.load(open("data/xgboost-models/{}_boosting.model".format(trait), "rb"))
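

# Hypothetical helper (not in the original gist): boostwith() loads a pickled
# search object when load_from_disk=True but then discards it, so a minimal
# sketch for reusing a saved model elsewhere might look like this.
def load_boosting_model(name):
    """Reload a pickled GridSearchCV saved by boostwith(); `name` is the
    trait/feature-set tag, e.g. "neu_union_pearson" (assumed naming)."""
    with open("data/xgboost-models/{}_boosting.model".format(name), "rb") as f:
        return pickle.load(f)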


def prepare_data(training_data, relative_test_size):
    scores = training_data[[*TRAITS]]
    features = training_data.drop(['userid'] + list(TRAITS), axis=1)
    # split into train-validation and test sets
    features_train, features_test, scores_train, scores_test = train_test_split(
        features, scores, test_size=relative_test_size, random_state=7)
    return features_train, features_test, scores_train, scores_test


if __name__ == "__main__":
    data = pd.read_csv("data/training_data.csv", index_col=0)
    relative_test_size = TEST_SIZE
    features_train, features_test, scores_train, scores_test = prepare_data(data, relative_test_size)
    print("Number of training samples", len(features_train))
    print("Number of test samples", len(features_test))
    for trait in TRAITS_TO_TRAIN:
        for feature_set in [trait, 'common', 'union']:
            if feature_set == 'union':
                selected_features_boosting['union'] = set(selected_features_boosting[trait]) | set(selected_features_boosting['common'])
                selected_features_pearson['union'] = set(selected_features_pearson[trait]) | set(selected_features_pearson['common'])

            # train on the Pearson-selected features
            feature_names = selected_features_pearson[feature_set]
            name = trait + "_" + feature_set + "_" + "pearson"
            print("######################")
            print("starting next training with pearson:", trait, feature_set)
            print("######################")
            # select features
            X_train = features_train[[*feature_names]]
            X_test = features_test[[*feature_names]]
            y_train = scores_train[trait].values.flatten()
            y_test = scores_test[trait].values.flatten()
            # actual training
            boostwith(X_train, X_test, y_train, y_test, name)

            # ---------------------------------------------------------------------------------
            # train on the boosting-selected features
            feature_names = selected_features_boosting[feature_set]
            name = trait + "_" + feature_set + "_" + "boosting"
            print("######################")
            print("starting next training with boosting:", trait, feature_set)
            print("######################")
            # select features
            X_train = features_train[[*feature_names]]
            X_test = features_test[[*feature_names]]
            y_train = scores_train[trait].values.flatten()
            y_test = scores_test[trait].values.flatten()
            # actual training
            boostwith(X_train, X_test, y_train, y_test, name)
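

# A minimal follow-up sketch (assumed usage, not part of the original gist):
# each pickled GridSearchCV can be reloaded and its cross-validation table
# inspected. With the multi-metric scoring dict above, sklearn exposes
# mean_test_rmse and mean_test_r2 columns in cv_results_; the rmse column is
# negated because its scorer was built with greater_is_better=False.
#
#   clf = load_boosting_model("neu_union_pearson")
#   print(clf.best_params_)
#   print(pd.DataFrame(clf.cv_results_)[["params", "mean_test_rmse", "mean_test_r2"]])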