Skip to content

Instantly share code, notes, and snippets.

@alucard001
Last active January 17, 2018 01:15
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alucard001/97b55f7361cb0089b9466e57e99d5737 to your computer and use it in GitHub Desktop.
Save alucard001/97b55f7361cb0089b9466e57e99d5737 to your computer and use it in GitHub Desktop.
Using TPOT Classifier to analyze the Numerai dataset
"""
Largely taken from the TPOT example. The only controls you have to prevent a timeout
and get a successful run are the "generations" and "population_size" parameters.
Remember, scoring is "log_loss" as of 18 Jan 2017, not probability.
The larger generations and population_size are, the longer it takes to get a result.
"""
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
# Load the Numerai training set; the CSV is read from the current working directory.
data = pd.read_csv("numerai_training_data.csv")
# Columns 0-48 are used as features and column 50 as the target.
# NOTE(review): column 49 is used as neither a feature nor the target --
# confirm against the CSV layout whether the feature slice should be 0:50.
X, y = data.iloc[:, 0:49], data.iloc[:, 50]
# Hold out 25% of the rows to score the fitted pipeline.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, test_size=0.25)
# "neg_log_loss" is the scikit-learn scorer name for log loss; the bare
# "log_loss" name was deprecated in scikit-learn 0.18 and later removed.
# generations and population_size are the knobs that control run time.
tpot = TPOTClassifier(generations=4, population_size=20, verbosity=2, scoring="neg_log_loss")
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Write the best pipeline found as a standalone Python script.
tpot.export('tpot_numerai_pipeline.py')
"""
When using generations = 3
"""
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
# NOTE: Make sure that the class is labeled 'class' in the data file
# Replace the two placeholder strings below with the real data path and column separator.
# np.genfromtxt with names=True and case_sensitive='lower', viewed as a
# recarray, is what np.recfromcsv did; the recfromcsv alias was removed from
# the top-level NumPy namespace in NumPy 2.0.
tpot_data = np.genfromtxt(
    'PATH/TO/DATA/FILE',
    delimiter='COLUMN_SEPARATOR',
    dtype=np.float64,
    names=True,
    case_sensitive='lower',
).view(np.recarray)
# View the records as a plain 2-D float64 matrix (one row per record), then
# drop the 'class' column so only the feature columns remain.
features = np.delete(
    tpot_data.view(np.float64).reshape(tpot_data.size, -1),
    tpot_data.dtype.names.index('class'),
    axis=1,
)
# Fixed random_state keeps the train/test split reproducible.
training_features, testing_features, training_classes, testing_classes = train_test_split(
    features, tpot_data['class'], random_state=42
)
# The pipeline consists of a single multinomial naive Bayes classifier.
exported_pipeline = make_pipeline(
    MultinomialNB(alpha=0.1, fit_prior=True)
)
exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
"""
When using generations = 4
"""
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
# NOTE: Make sure that the class is labeled 'class' in the data file
# Replace the two placeholder strings below with the real data path and column separator.
# np.genfromtxt with names=True and case_sensitive='lower', viewed as a
# recarray, is what np.recfromcsv did; the recfromcsv alias was removed from
# the top-level NumPy namespace in NumPy 2.0.
tpot_data = np.genfromtxt(
    'PATH/TO/DATA/FILE',
    delimiter='COLUMN_SEPARATOR',
    dtype=np.float64,
    names=True,
    case_sensitive='lower',
).view(np.recarray)
# View the records as a plain 2-D float64 matrix (one row per record), then
# drop the 'class' column so only the feature columns remain.
features = np.delete(
    tpot_data.view(np.float64).reshape(tpot_data.size, -1),
    tpot_data.dtype.names.index('class'),
    axis=1,
)
# Fixed random_state keeps the train/test split reproducible.
training_features, testing_features, training_classes, testing_classes = train_test_split(
    features, tpot_data['class'], random_state=42
)
exported_pipeline = make_pipeline(
    # Union of two identity transformers: the input columns appear twice.
    make_union(
        FunctionTransformer(lambda X: X),
        FunctionTransformer(lambda X: X)
    ),
    MinMaxScaler(),
    # A GaussianNB voting ensemble is combined with an identity pass-through;
    # presumably this appends the ensemble's output to the scaled features
    # as extra columns -- verify against the TPOT version that exported it.
    make_union(
        VotingClassifier([("est", GaussianNB())]),
        FunctionTransformer(lambda X: X)
    ),
    MultinomialNB(alpha=0.71, fit_prior=True)
)
exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment