@jeethu
Created April 23, 2020 19:20
numerai example_model with int8 features
#!/usr/bin/env python
"""
Example model on Numerai data using an XGBoost regressor.
To get started, install the required packages: pip install pandas numpy scikit-learn xgboost
"""
import csv
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
TOURNAMENT_NAME = "kazutsugi"
TARGET_NAME = f"target_{TOURNAMENT_NAME}"
PREDICTION_NAME = f"prediction_{TOURNAMENT_NAME}"
BENCHMARK = 0
BAND = 0.2

# Submissions are scored by Spearman correlation
def score(df):
    # method="first" breaks ties based on order in array
    return np.corrcoef(
        df[TARGET_NAME],
        df[PREDICTION_NAME].rank(pct=True, method="first")
    )[0, 1]

# The payout function
def payout(scores):
    return ((scores - BENCHMARK) / BAND).clip(lower=-1, upper=1)
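# For example, with BENCHMARK = 0 and BAND = 0.2, a per-era correlation of 0.05
# pays out (0.05 - 0) / 0.2 = 0.25; anything beyond +/-0.2 is clipped to +/-1.
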
# Read the csv file into a pandas Dataframe
# Also, rescale features to int8 and targets to fp16 (saves memory)
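# Note: the kazutsugi features appear to take values in {0.0, 0.25, 0.5, 0.75, 1.0},
# so multiplying by 4 gives the small integers 0-4, which fit in a uint8.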
def read_csv(file_path):
    to_uint8 = lambda x: np.uint8(float(x) * 4)
    with open(file_path) as f:
        column_names = next(csv.reader(f))
    # Downcast the target column (e.g. "target_kazutsugi") to float16
    dtypes = {TARGET_NAME: np.float16}
    converters = {x: to_uint8 for x in column_names if x.startswith('feature')}
    return pd.read_csv(file_path, dtype=dtypes, converters=converters).set_index("id")

def main():
    print("# Loading data...")
    # The training data is used to train your model how to predict the targets.
    training_data = read_csv("numerai_training_data.csv")
    # The tournament data is the data that Numerai uses to evaluate your model.
    tournament_data = read_csv("numerai_tournament_data.csv")

    feature_names = [f for f in training_data.columns if f.startswith("feature")]
    print(f"Loaded {len(feature_names)} features")

    print("Training model")
    # For faster experimentation you can decrease n_estimators to 200,
    # for better performance increase to 20,000
    model = XGBRegressor(max_depth=5, learning_rate=0.01, n_estimators=2000,
                         n_jobs=-1, colsample_bytree=0.1)
    model.fit(training_data[feature_names], training_data[TARGET_NAME])

    print("Generating predictions")
    training_data[PREDICTION_NAME] = model.predict(training_data[feature_names])
    tournament_data[PREDICTION_NAME] = model.predict(tournament_data[feature_names])

    # Check the per-era correlations on the training set
    train_correlations = training_data.groupby("era").apply(score)
    print(f"On training the correlation has mean {train_correlations.mean()} "
          f"and std {train_correlations.std()}")
    print(f"On training the average per-era payout is {payout(train_correlations).mean()}")

    # Check the per-era correlations on the validation set
    validation_data = tournament_data[tournament_data.data_type == "validation"]
    validation_correlations = validation_data.groupby("era").apply(score)
    print(f"On validation the correlation has mean {validation_correlations.mean()} "
          f"and std {validation_correlations.std()}")
    print(f"On validation the average per-era payout is {payout(validation_correlations).mean()}")
    # Save the predictions; header=True so the submission CSV has a column header
    tournament_data[PREDICTION_NAME].to_csv(TOURNAMENT_NAME + "_submission.csv", header=True)
    # Now you can upload these predictions on https://numer.ai


if __name__ == '__main__':
    main()
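
A quick, optional way to check the "saves memory" note on read_csv is to compare the downcast frame against a plain pandas load. The sketch below is just that, a sketch: it assumes the script above has been saved as example_model.py (the filename is an assumption made for the import) and reuses the same numerai_training_data.csv the script already reads.

# Rough memory comparison of the downcast loader vs. a plain pandas load.
# Assumes the script above is saved as example_model.py (assumed filename).
import pandas as pd

from example_model import read_csv

full = pd.read_csv("numerai_training_data.csv").set_index("id")  # float64 features
compact = read_csv("numerai_training_data.csv")                  # uint8 features, float16 target

print(f"float64 frame:       {full.memory_usage(deep=True).sum() / 2**20:.0f} MiB")
print(f"uint8/float16 frame: {compact.memory_usage(deep=True).sum() / 2**20:.0f} MiB")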