@jeethu
Created April 23, 2020 19:20
numerai example_model with int8 features
#!/usr/bin/env python
"""
Example model on Numerai data using an XGBoost regressor.
To get started, install the required packages: pip install pandas numpy scikit-learn xgboost
"""
import csv
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
TOURNAMENT_NAME = "kazutsugi"
TARGET_NAME = f"target_{TOURNAMENT_NAME}"
PREDICTION_NAME = f"prediction_{TOURNAMENT_NAME}"
BENCHMARK = 0
BAND = 0.2

# Submissions are scored by Spearman correlation
def score(df):
    # method="first" breaks ties based on order in array
    return np.corrcoef(
        df[TARGET_NAME],
        df[PREDICTION_NAME].rank(pct=True, method="first")
    )[0, 1]

# The payout function
def payout(scores):
    return ((scores - BENCHMARK) / BAND).clip(lower=-1, upper=1)
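# For example, with BENCHMARK = 0 and BAND = 0.2, a per-era correlation of 0.05
# pays out (0.05 - 0) / 0.2 = 0.25; anything beyond +/-0.2 is clipped to +/-1.
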
# Read the csv file into a pandas Dataframe
# Also, rescale features to int8 and targets to fp16 (saves memory)
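# Note: the kazutsugi features appear to take values in {0.0, 0.25, 0.5, 0.75, 1.0},
# so multiplying by 4 gives the small integers 0-4, which fit in a uint8.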
def read_csv(file_path):
    to_uint8 = lambda x: np.uint8(float(x) * 4)
    with open(file_path) as f:
        column_names = next(csv.reader(f))
    # Downcast the target column (e.g. "target_kazutsugi") to float16
    dtypes = {TARGET_NAME: np.float16}
    converters = {x: to_uint8 for x in column_names if x.startswith('feature')}
    return pd.read_csv(file_path, dtype=dtypes, converters=converters).set_index("id")

def main():
    print("# Loading data...")
    # The training data is used to train your model how to predict the targets.
    training_data = read_csv("numerai_training_data.csv")
    # The tournament data is the data that Numerai uses to evaluate your model.
    tournament_data = read_csv("numerai_tournament_data.csv")

    feature_names = [f for f in training_data.columns if f.startswith("feature")]
    print(f"Loaded {len(feature_names)} features")

    print("Training model")
    # For faster experimentation you can decrease n_estimators to 200,
    # for better performance increase to 20,000
    model = XGBRegressor(max_depth=5, learning_rate=0.01, n_estimators=2000,
                         n_jobs=-1, colsample_bytree=0.1)
    model.fit(training_data[feature_names], training_data[TARGET_NAME])

    print("Generating predictions")
    training_data[PREDICTION_NAME] = model.predict(training_data[feature_names])
    tournament_data[PREDICTION_NAME] = model.predict(tournament_data[feature_names])

    # Check the per-era correlations on the training set
    train_correlations = training_data.groupby("era").apply(score)
    print(f"On training the correlation has mean {train_correlations.mean()} "
          f"and std {train_correlations.std()}")
    print(f"On training the average per-era payout is {payout(train_correlations).mean()}")

    # Check the per-era correlations on the validation set
    validation_data = tournament_data[tournament_data.data_type == "validation"]
    validation_correlations = validation_data.groupby("era").apply(score)
    print(f"On validation the correlation has mean {validation_correlations.mean()} "
          f"and std {validation_correlations.std()}")
    print(f"On validation the average per-era payout is {payout(validation_correlations).mean()}")
    # Save the predictions; header=True so the submission CSV has a column header
    tournament_data[PREDICTION_NAME].to_csv(TOURNAMENT_NAME + "_submission.csv", header=True)
    # Now you can upload these predictions on https://numer.ai


if __name__ == '__main__':
    main()
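
A quick, optional way to check the "saves memory" note on read_csv is to compare the downcast frame against a plain pandas load. The sketch below is just that, a sketch: it assumes the script above has been saved as example_model.py (the filename is an assumption made for the import) and reuses the same numerai_training_data.csv the script already reads.

# Rough memory comparison of the downcast loader vs. a plain pandas load.
# Assumes the script above is saved as example_model.py (assumed filename).
import pandas as pd

from example_model import read_csv

full = pd.read_csv("numerai_training_data.csv").set_index("id")  # float64 features
compact = read_csv("numerai_training_data.csv")                  # uint8 features, float16 target

print(f"float64 frame:       {full.memory_usage(deep=True).sum() / 2**20:.0f} MiB")
print(f"uint8/float16 frame: {compact.memory_usage(deep=True).sum() / 2**20:.0f} MiB")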