Skip to content

Instantly share code, notes, and snippets.

@mmmayo13
Last active May 12, 2021 12:16
Show Gist options
  • Save mmmayo13/060971394cf98914bce96ceb9c58ca87 to your computer and use it in GitHub Desktop.
Save mmmayo13/060971394cf98914bce96ceb9c58ca87 to your computer and use it in GitHub Desktop.
import os
import timeit

from sklearn import metrics
from sklearn.datasets import load_digits, load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier
def main():
    """Run the TPOT pipeline optimizer several times and compare the results.

    Loads the configured scikit-learn dataset, splits it into train and test
    sets, and fits a ``TPOTClassifier`` ``search_iters`` times. After each
    fit, the elapsed time and the score on the held-out test data are printed
    and the best pipeline is exported as a Python script into
    ``output_folder``. Finally, reports whether every iteration produced the
    same best pipeline (compared by string representation).

    Raises:
        ValueError: If ``dataset`` is not one of the supported dataset names.
    """
    # Run configuration.
    dataset = 'iris'  # supported values: 'iris', 'digits'
    random_state = 42
    train_size = 0.75
    test_size = 1.0 - train_size
    checkpoint_folder = './tpot_checkpoints'
    output_folder = './tpot_output'
    search_iters = 3
    verbosity = 0
    generations = 5
    population_size = 50
    n_jobs = -1

    best_pipes = []

    # Load and split the dataset. A dispatch table keeps the set of supported
    # names in one place and lets us fail early with a clear message.
    loaders = {'iris': load_iris, 'digits': load_digits}
    try:
        ds = loaders[dataset]()
    except KeyError:
        raise ValueError(
            f'Unsupported dataset {dataset!r}; expected one of {sorted(loaders)}'
        ) from None

    X_train, X_test, y_train, y_test = train_test_split(
        ds.data,
        ds.target,
        train_size=train_size,
        test_size=test_size,
        random_state=random_state,
    )

    # Define the scoring metric and model evaluation method.
    scoring = 'accuracy'
    cv_name = 'stratified k-fold cross-validation'
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

    # Define the search.
    tpot = TPOTClassifier(
        cv=cv,
        scoring=scoring,
        verbosity=verbosity,
        random_state=random_state,
        n_jobs=n_jobs,
        generations=generations,
        population_size=population_size,
        periodic_checkpoint_folder=checkpoint_folder,
    )

    # The exported pipeline scripts are written here; create the folder up
    # front so the first export does not fail on a fresh checkout.
    os.makedirs(output_folder, exist_ok=True)

    print(f'Optimizing prediction pipeline for the {dataset} dataset with {cv_name} using the {scoring} scoring metric')

    # Pipeline optimization iterations.
    # NOTE(review): the same estimator (and the same random_state) is re-fit
    # on every iteration; confirm whether identical results are the intent.
    for i in range(search_iters):
        print(f'\nPipeline optimization iteration: {i}')
        start_time = timeit.default_timer()
        tpot.fit(X_train, y_train)
        elapsed = timeit.default_timer() - start_time
        score = tpot.score(X_test, y_test)
        best_pipes.append(tpot.fitted_pipeline_)
        tpot.export(f'{output_folder}/tpot_{dataset}_pipeline_{i}.py')
        print(f'>>> elapsed time: {elapsed} seconds')
        print(f'>>> pipeline score on test data: {score}')

    # Check whether all best pipelines are the same by comparing their
    # string representations.
    distinct_pipes = {str(pipe) for pipe in best_pipes}
    if len(distinct_pipes) == 1:
        print("\nAll best pipelines were the same:\n")
        print(best_pipes[0])
    else:
        print('\nBest pipelines:\n')
        print(*best_pipes, sep='\n\n')


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment