#!/usr/bin/env python3
# coding: utf8
""" Example for numer.ai competition """
import math
import os
import sys
import numpy
import pandas
__author__ = "Damian Schwyrz"
__copyright__ = "Copyright 2007, damianschwyrz.de"
__email__ = "mail@damianschwyrz.de"
# Settings
feature_cols_to_keep = []
eras_to_exclude_from_trainingset = []
kept_eras_in_trainingset = []
combine_training_and_val_data_into_one_trainingset = False
save_good_models = True
save_result_file = True
cv_number = 4
test_size_for_train_test_split = 0.25
use_adaboosting = False
# Folders
workfolder = os.getcwd() + "/"
datafolder = workfolder + "data/"
resultsfolder = workfolder + "results/"
modelfolder = workfolder + "models/"
# Perform basic file and folder checks...
if not os.path.exists(modelfolder):
    print("Creating models folder...")
    os.makedirs(modelfolder)
if not os.path.exists(resultsfolder):
    print("Creating results folder...")
    os.makedirs(resultsfolder)
# Check if both data files exist
if not os.path.exists(datafolder + "numerai_tournament_data.csv"):
    print("Sorry, numerai_tournament_data.csv not found in ./data!")
    sys.exit()
if not os.path.exists(datafolder + "numerai_training_data.csv"):
    print("Sorry, numerai_training_data.csv not found in ./data!")
    sys.exit()
# It's not recommended to include AND exclude eras at the same time... it doesn't make sense.
if len(eras_to_exclude_from_trainingset) > 0 and len(kept_eras_in_trainingset) > 0:
    print("Please don't use 'eras_to_exclude_from_trainingset' AND 'kept_eras_in_trainingset' at the same time!")
    sys.exit()
# Load the data into RAM
tournament_data = pandas.read_csv(datafolder + "numerai_tournament_data.csv")
training_data = pandas.read_csv(datafolder + "numerai_training_data.csv")
validation_data = tournament_data[tournament_data.data_type == 'validation']
# Quick summary
print("Data summary after raw import:")
print("Tournament data:\t{} rows\t\t{} columns".format(tournament_data.shape[0], tournament_data.shape[1]))
print("Training data:\t\t{} rows\t\t{} columns".format(training_data.shape[0], training_data.shape[1]))
print("Validation data:\t{} rows\t\t{} columns".format(validation_data.shape[0], validation_data.shape[1]))
print()
# Columns which are not features
no_feature_cols = ['id', 'era', 'data_type', 'target']
# Get ids and features of tournament data
Ids_tournament = tournament_data.id.values
X_tournament = tournament_data.drop(no_feature_cols, axis=1)
# Sometimes you want to combine the training and validation data into one training set! ;)
# Keep in mind: it's not recommended!
if combine_training_and_val_data_into_one_trainingset:
    training_data = pandas.concat([training_data, validation_data])
# Some eras are crap? Exclude them from your training dataset
if len(eras_to_exclude_from_trainingset) > 0:
    excluded_eras = ["era" + str(int(x)) for x in eras_to_exclude_from_trainingset]
    mask = training_data.era.isin(excluded_eras)
    training_data = training_data[~mask]
# Or the other way around: use only selected eras for training
if len(kept_eras_in_trainingset) > 0:
    eras_kept = ["era" + str(int(x)) for x in kept_eras_in_trainingset]
    mask = training_data.era.isin(eras_kept)
    training_data = training_data[mask]
# Split training data into features and targets
X_training = training_data.drop(no_feature_cols, axis=1)
y_training = training_data.target.values
# Same for validation data + get unique eras
X_validation = validation_data.drop(no_feature_cols, axis=1)
y_validation = validation_data.target.values
eras_validation = validation_data.era.unique()
# Not all features are equally important, so it may be a good idea to include only specific features
if len(feature_cols_to_keep) > 0:
    mask = ["feature{}".format(feature_id) for feature_id in feature_cols_to_keep]
    X_tournament = X_tournament[mask]
    X_training = X_training[mask]
    X_validation = X_validation[mask]
# Maybe it is a good idea to construct your own features? Hint... think about it!
# You need to know what you are doing... this is just a plain and simple example where feature 1 and 2 are multiplied
# and the product is raised to the power of 2. The result is used as a new feature called "your_feature".
# WARNING: Don't forget to add your new features to your validation and tournament data as well, otherwise fitting will fail
# feature_construct = lambda data: pow((data.feature1 * data.feature2), 2)
# X_training['your_feature'] = feature_construct(X_training)
# X_tournament['your_feature'] = feature_construct(X_tournament)
# X_validation['your_feature'] = feature_construct(X_validation)
print("Current size of training set:")
print("Rows/Data points: {}\t\tColumns/Features: {}".format(X_training.shape[0], X_training.shape[1]))
print()
# From here on we need numpy arrays, not the pandas DataFrame structures
X_training = X_training.values
X_tournament = X_tournament.values
X_validation = X_validation.values
# Use a simple data preprocessor
# You could also use a sklearn pipeline combining multiple preprocessors, including for example principal component
# analysis or any kind of kernel approximation...
from sklearn import preprocessing, pipeline
preprocessor = pipeline.Pipeline(
    [
        ('ss', preprocessing.StandardScaler()),
        # ('pca', decomposition.PCA(n_components=15))
    ]
)
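# A possible richer pipeline, as mentioned above (a sketch with illustrative parameters; it assumes sklearn's
# decomposition and kernel_approximation modules and is not tuned for this data):
# from sklearn import decomposition, kernel_approximation
# preprocessor = pipeline.Pipeline(
#     [
#         ('ss', preprocessing.StandardScaler()),
#         ('pca', decomposition.PCA(n_components=15)),
#         ('nystroem', kernel_approximation.Nystroem(n_components=100, random_state=42)),
#     ]
# )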
preprocessor.fit(X_training)
X_training = preprocessor.transform(X_training)
X_tournament = preprocessor.transform(X_tournament)
X_validation = preprocessor.transform(X_validation)
# Since we have validation data to check our model on, we don't strictly need a training/test split. But sometimes
# it may be a good idea to do!
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_training,
    y_training,
    test_size=test_size_for_train_test_split,
    random_state=42,
)
# Now the simple "machine learning" part... the model. We are using a simple tree-based classifier.
# Take a look at http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
from sklearn import ensemble
model = ensemble.ExtraTreesClassifier(
    n_estimators=10,
    max_depth=3,
    random_state=42,
    n_jobs=-1,
)
model.fit(X_train, y_train) # <-- training of model happens here
# This is pretty important for the probability columns in the next steps, so keep it in mind!
print("Class/Target order:")
print(model.classes_)
print()
# Some estimators are able to calculate the importance of features after fitting. This is a very simple way to get
# similar results while using a smaller number of features (the more features you have, the slower training can be).
# For the next iteration you could add the generated list to the "feature_cols_to_keep" setting above.
if not len(feature_cols_to_keep) > 0:
    print(model.feature_importances_)
    importance_average = numpy.mean(model.feature_importances_)
    print("Features with an importance above the current mean importance ({:.6f}):".format(importance_average))
    above_average_important_features = [(i + 1) for (i, importance) in enumerate(model.feature_importances_) if
                                        importance >= importance_average]
    print(above_average_important_features)
    print()
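    # For example, on the next run you could paste the printed list into the settings above, roughly like this
    # (the numbers below are placeholders, not real results):
    # feature_cols_to_keep = [3, 7, 12]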
# Let's take a look at the performance using the model's internal score method
score_test_data = model.score(X_test, y_test)
score_val_data = model.score(X_validation, y_validation)
print("Model: Score for {:.2f}% of training data:\t\t{:.6f}".format(
    test_size_for_train_test_split * 100,
    score_test_data
))
print("Model: Score for nmr's own validation data :\t\t{:.6f}".format(score_val_data))
if cv_number > 0:
    # Let's see what cross-validation says:
    scores_test_data_cv = model_selection.cross_val_score(model, X_training, y_training, cv=cv_number, n_jobs=-1)
    scores_val_data_cv = model_selection.cross_val_score(model, X_validation, y_validation, cv=cv_number, n_jobs=-1)
    print("CV ({}): Score for 100% of training data:\t\t{:.6f} (+-{:.6f})".format(
        cv_number,
        scores_test_data_cv.mean(),
        scores_test_data_cv.std() / math.sqrt(cv_number)
    ))
    print("CV ({}): Score for nmr's own validation data :\t{:.6f} (+-{:.6f})".format(
        cv_number,
        scores_val_data_cv.mean(),
        scores_val_data_cv.std() / math.sqrt(cv_number)
    ))
# What about the log loss?
from sklearn import metrics
probability_test_data = model.predict_proba(X_test)
probability_val_data = model.predict_proba(X_validation)
logloss_test_data = metrics.log_loss(y_test, probability_test_data)
logloss_val_data = metrics.log_loss(y_validation, probability_val_data)
print("Logloss for {:.2f}% training data:\t\t\t\t{:.6f}".format(
test_size_for_train_test_split * 100,
logloss_test_data
))
print("Logloss for validation data:\t\t\t\t\t\t{:.6f}".format(logloss_val_data))
print()
if use_adaboosting:
    print("AdaboostClassifier activated as meta-classifier, refitting...")
    ada_model = ensemble.AdaBoostClassifier(
        base_estimator=ensemble.ExtraTreesClassifier(
            n_estimators=10,
            max_depth=3,
            random_state=42,
            n_jobs=-1,
        ),
        n_estimators=75,
        learning_rate=0.1,
        random_state=42,
    )
    ada_model.fit(X_train, y_train)
    # Let's get basic performance metrics for the boosted model
    boosted_score_test_data = ada_model.score(X_test, y_test)
    boosted_score_val_data = ada_model.score(X_validation, y_validation)
    print("[Boosted] Model: Score on {:.2f}% of training data:\t\t{:.6f}".format(
        test_size_for_train_test_split * 100,
        boosted_score_test_data
    ))
    print("[Boosted] Model: Score on nmr's own validation data :\t{:.6f}".format(boosted_score_val_data))
    boosted_probability_test_data = ada_model.predict_proba(X_test)
    boosted_probability_val_data = ada_model.predict_proba(X_validation)
    boosted_logloss_test_data = metrics.log_loss(y_test, boosted_probability_test_data)
    boosted_logloss_val_data = metrics.log_loss(y_validation, boosted_probability_val_data)
    print("[Boosted] Logloss on training data:\t\t\t\t\t\t{:.6f}".format(boosted_logloss_test_data))
    print("[Boosted] Logloss on validation data:\t\t\t\t\t{:.6f}".format(boosted_logloss_val_data))
    print()
    # Let's calculate the improvements
    print("Improvement:")
    improvement_test_score = (score_test_data - boosted_score_test_data) / score_test_data * 100
    print("[Testdata] Score (base classifier vs adaboosted classifier):\t\t\t{:.6f}%".format(
        improvement_test_score
    ))
    improvement_test_logloss = (logloss_test_data - boosted_logloss_test_data) / logloss_test_data * 100
    print("[Testdata] Logloss (base classifier vs adaboosted classifier):\t\t\t{:.2f}%".format(
        improvement_test_logloss
    ))
    improvement_val_score = (score_val_data - boosted_score_val_data) / score_val_data * 100
    print("[Validationdata] Score (base classifier vs adaboosted classifier):\t\t{:.6f}%".format(
        improvement_val_score
    ))
    improvement_val_logloss = (logloss_val_data - boosted_logloss_val_data) / logloss_val_data * 100
    print("[Validationdata] Logloss (base classifier vs adaboosted classifier):\t{:.2f}%".format(
        improvement_val_logloss
    ))
    # If adaboosting improved our scores, let's make it the model to use for writing results and the one to be
    # saved for later use. A lower logloss is an improvement, so positive logloss percentages are good here.
    if improvement_test_logloss > 0 and improvement_val_logloss > 0:
        model = ada_model
    print()
# Let's use our model to predict every target within the tournament data. The format numer.ai expects for the results is:
# id, probability (of the target being 1)
probability_tournament_data = model.predict_proba(X_tournament)
# Most sklearn predict_proba methods return, for every feature row, a list containing the probability of each class! We
# are interested in the probability of the target being "1". model.classes_ shows us which element we have to select for
# this probability. In our case it is the second element of every subarray (we start counting at 0!! ;)
# I'm pretty sure you will have to check this point on your own to fully get it!
probability_for_tournament_data_of_being_1 = probability_tournament_data[:, 1]
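# A slightly more defensive variant (a sketch; it assumes the positive class is labelled 1 in model.classes_):
# column_of_class_1 = list(model.classes_).index(1)
# probability_for_tournament_data_of_being_1 = probability_tournament_data[:, column_of_class_1]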
numer_ai_result = pandas.DataFrame(
    {
        'id': Ids_tournament,
        'probability': probability_for_tournament_data_of_being_1,
    }
)
# In the numerai competition the "consistency" is also important. You can find its implementation in numer.ai's github:
# https://github.com/numerai/submission-criteria/blob/820d0f939ae2892f6bdeee02d855ffc0e80958de/database_manager.py#L83
# Let's copy this part!
better_than_random_era_count = 0
for era in eras_validation:
    era_data = validation_data[validation_data.era == era]
    submission_era_data = numer_ai_result[numer_ai_result.id.isin(era_data.id.values)]
    era_data = era_data.sort_values(["id"])
    submission_era_data = submission_era_data.sort_values(["id"])
    logloss = metrics.log_loss(era_data.target.values, submission_era_data.probability.values)
    if logloss < -math.log(0.5):
        better_than_random_era_count += 1
consistency = better_than_random_era_count / len(eras_validation) * 100
print("Calculated consistency: {:.2f}%".format(consistency))
print()
# You could also implement the originality and concordance metrics ;)
# For now we skip those steps
# If our logloss is below ~0.693, i.e. -math.log(0.5), AND the consistency is greater than 75%, we could submit our result file!
# Let's create our result file if these conditions are true; we are using the logloss based on numerai's validation data.
if logloss_val_data > -math.log(0.5):
    print("Sorry, logloss is {:.6f}, that's bigger than {:.6f}".format(logloss_val_data, -math.log(0.5)))
    sys.exit()
if consistency < 75:
    print("Sorry, consistency is only {:.2f}%, that's smaller than 75%".format(consistency))
    sys.exit()
file_base_name = "sklearn-{:.6f}-{:.3f}-{:.2f}".format(logloss_val_data, score_val_data, consistency)
if save_result_file:
    print("Writing result file...")
    numer_ai_result.to_csv(
        resultsfolder + file_base_name + ".csv",
        index=False
    )
# Since we found our super good model, it's a good idea to save it for future use (every week starts a new challenge!).
if save_good_models:
    import pickle
    print("Saving model...")
    with open(modelfolder + file_base_name + ".model", 'wb') as model_file:
        pickle.dump(model, model_file)
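    # To reuse a saved model in a later round, loading it back should look roughly like this (a sketch; the
    # path would point at whichever .model file you want to reuse):
    # with open(modelfolder + file_base_name + ".model", 'rb') as model_file:
    #     model = pickle.load(model_file)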