alucard001/numerai.py

## numerai.py
"""
Largely adapted from the TPOT example.  The only controls you have for keeping a
run from timing out (so that it finishes successfully) are the "generations" and
"population_size" parameters.
Remember, scoring is "log_loss" as of 18 Jan 2017, not probability.

The larger generations and population_size are, the longer it takes to get a result.
"""

from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Positional index of the label column; every column before it is treated as
# a feature.
# NOTE(review): assumes the CSV is laid out as <features...>, target with the
# target at index 50 -- confirm against the header of numerai_training_data.csv.
TARGET_COLUMN = 50

data = pd.read_csv("numerai_training_data.csv")

# iloc's end bound is exclusive, so the feature slice has to stop AT the
# target index; a `0:49` slice would leave column 49 out of both X and y.
X = data.iloc[:, :TARGET_COLUMN]
y = data.iloc[:, TARGET_COLUMN]

# Hold out 25% of the rows so the winning pipeline is scored on unseen data.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

# generations and population_size are the knobs that bound how long the
# search runs (see the module docstring).
# NOTE(review): newer TPOT / scikit-learn releases name this scorer
# "neg_log_loss" -- confirm against the installed version.
tpot = TPOTClassifier(generations=4, population_size=20, verbosity=2, scoring="log_loss")

tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Save the best pipeline found as a script.
tpot.export('tpot_numerai_pipeline.py')

## tpot_numerai_pipeline-gen-3.py
"""
Pipeline produced when running with generations=3.
"""
import numpy as np

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv(
    'PATH/TO/DATA/FILE',
    delimiter='COLUMN_SEPARATOR',
    dtype=np.float64,
)

# Reinterpret the record array as a plain float64 matrix with one row per
# record, then drop the 'class' column so only feature columns remain.
value_matrix = tpot_data.view(np.float64).reshape(tpot_data.size, -1)
class_position = tpot_data.dtype.names.index('class')
features = np.delete(value_matrix, class_position, axis=1)

# Fixed random_state keeps the train/test partition the same on every run.
(
    training_features,
    testing_features,
    training_classes,
    testing_classes,
) = train_test_split(features, tpot_data['class'], random_state=42)

# Single-step pipeline: a multinomial naive Bayes classifier.
exported_pipeline = make_pipeline(MultinomialNB(alpha=0.1, fit_prior=True))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)

## tpot_numerai_pipeline-gen-4.py
"""
Pipeline produced when running with generations=4.
"""
import numpy as np

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv(
    'PATH/TO/DATA/FILE',
    delimiter='COLUMN_SEPARATOR',
    dtype=np.float64,
)

# Reinterpret the record array as a plain float64 matrix with one row per
# record, then drop the 'class' column so only feature columns remain.
value_matrix = tpot_data.view(np.float64).reshape(tpot_data.size, -1)
class_position = tpot_data.dtype.names.index('class')
features = np.delete(value_matrix, class_position, axis=1)

# Fixed random_state keeps the train/test partition the same on every run.
(
    training_features,
    testing_features,
    training_classes,
    testing_classes,
) = train_test_split(features, tpot_data['class'], random_state=42)

# Steps are listed in execution order and handed to make_pipeline unchanged.
pipeline_steps = [
    # Union of two pass-through (identity) transformers.
    make_union(
        FunctionTransformer(lambda X: X),
        FunctionTransformer(lambda X: X),
    ),
    MinMaxScaler(),
    # Union of a one-estimator GaussianNB voting classifier and a
    # pass-through transformer.
    make_union(
        VotingClassifier([("est", GaussianNB())]),
        FunctionTransformer(lambda X: X),
    ),
    # Final estimator.
    MultinomialNB(alpha=0.71, fit_prior=True),
]
exported_pipeline = make_pipeline(*pipeline_steps)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
	"""
	Largely adapted from the TPOT example.  The only controls you have for keeping a
	run from timing out (so that it finishes successfully) are the "generations" and
	"population_size" parameters.
	Remember, scoring is "log_loss" as of 18 Jan 2017, not probability.

	The larger generations and population_size are, the longer it takes to get a result.
	"""

	from tpot import TPOTClassifier
	from sklearn.model_selection import train_test_split
	import pandas as pd

	# Positional index of the label column; every column before it is treated as
	# a feature.
	# NOTE(review): assumes the CSV is laid out as <features...>, target with the
	# target at index 50 -- confirm against the header of numerai_training_data.csv.
	TARGET_COLUMN = 50

	data = pd.read_csv("numerai_training_data.csv")

	# iloc's end bound is exclusive, so the feature slice has to stop AT the
	# target index; a `0:49` slice would leave column 49 out of both X and y.
	X = data.iloc[:, :TARGET_COLUMN]
	y = data.iloc[:, TARGET_COLUMN]

	# Hold out 25% of the rows so the winning pipeline is scored on unseen data.
	X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)

	# generations and population_size are the knobs that bound how long the
	# search runs.
	# NOTE(review): newer TPOT / scikit-learn releases name this scorer
	# "neg_log_loss" -- confirm against the installed version.
	tpot = TPOTClassifier(generations=4, population_size=20, verbosity=2, scoring="log_loss")

	tpot.fit(X_train, y_train)
	print(tpot.score(X_test, y_test))
	# Save the best pipeline found as a script.
	tpot.export('tpot_numerai_pipeline.py')
	"""
	Pipeline produced when running with generations=3.
	"""
	import numpy as np

	from sklearn.ensemble import VotingClassifier
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.pipeline import make_pipeline, make_union
	from sklearn.preprocessing import FunctionTransformer

	# NOTE: Make sure that the class is labeled 'class' in the data file
	tpot_data = np.recfromcsv(
		'PATH/TO/DATA/FILE',
		delimiter='COLUMN_SEPARATOR',
		dtype=np.float64,
	)

	# Reinterpret the record array as a plain float64 matrix with one row per
	# record, then drop the 'class' column so only feature columns remain.
	value_matrix = tpot_data.view(np.float64).reshape(tpot_data.size, -1)
	class_position = tpot_data.dtype.names.index('class')
	features = np.delete(value_matrix, class_position, axis=1)

	# Fixed random_state keeps the train/test partition the same on every run.
	(
		training_features,
		testing_features,
		training_classes,
		testing_classes,
	) = train_test_split(features, tpot_data['class'], random_state=42)

	# Single-step pipeline: a multinomial naive Bayes classifier.
	exported_pipeline = make_pipeline(MultinomialNB(alpha=0.1, fit_prior=True))

	exported_pipeline.fit(training_features, training_classes)
	results = exported_pipeline.predict(testing_features)
	"""
	Pipeline produced when running with generations=4.
	"""
	import numpy as np

	from sklearn.ensemble import VotingClassifier
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import GaussianNB, MultinomialNB
	from sklearn.pipeline import make_pipeline, make_union
	from sklearn.preprocessing import FunctionTransformer, MinMaxScaler

	# NOTE: Make sure that the class is labeled 'class' in the data file
	tpot_data = np.recfromcsv(
		'PATH/TO/DATA/FILE',
		delimiter='COLUMN_SEPARATOR',
		dtype=np.float64,
	)

	# Reinterpret the record array as a plain float64 matrix with one row per
	# record, then drop the 'class' column so only feature columns remain.
	value_matrix = tpot_data.view(np.float64).reshape(tpot_data.size, -1)
	class_position = tpot_data.dtype.names.index('class')
	features = np.delete(value_matrix, class_position, axis=1)

	# Fixed random_state keeps the train/test partition the same on every run.
	(
		training_features,
		testing_features,
		training_classes,
		testing_classes,
	) = train_test_split(features, tpot_data['class'], random_state=42)

	# Steps are listed in execution order and handed to make_pipeline unchanged.
	pipeline_steps = [
		# Union of two pass-through (identity) transformers.
		make_union(
			FunctionTransformer(lambda X: X),
			FunctionTransformer(lambda X: X),
		),
		MinMaxScaler(),
		# Union of a one-estimator GaussianNB voting classifier and a
		# pass-through transformer.
		make_union(
			VotingClassifier([("est", GaussianNB())]),
			FunctionTransformer(lambda X: X),
		),
		# Final estimator.
		MultinomialNB(alpha=0.71, fit_prior=True),
	]
	exported_pipeline = make_pipeline(*pipeline_steps)

	exported_pipeline.fit(training_features, training_classes)
	results = exported_pipeline.predict(testing_features)