-
-
Save mmmayo13/060971394cf98914bce96ceb9c58ca87 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import timeit | |
from tpot import TPOTClassifier | |
from sklearn.model_selection import StratifiedKFold | |
from sklearn.model_selection import train_test_split | |
from sklearn.datasets import load_digits, load_iris | |
from sklearn import metrics | |
def main():
    """Search for a classification pipeline with TPOT and report the results.

    Loads the configured scikit-learn toy dataset, splits it into train and
    test partitions, then fits the same ``TPOTClassifier`` instance
    ``search_iters`` times. After each fit the elapsed wall-clock time and
    the score on the held-out test partition are printed, and the fitted
    pipeline is exported as a Python file into ``output_folder``. Finally
    the string forms of the fitted pipelines are compared to report whether
    every iteration produced the same pipeline.

    Raises:
        ValueError: If ``dataset`` is not one of the supported dataset names.
    """
    # Imported locally so the function is self-contained with respect to the
    # module-level import block.
    import os

    # --- configuration ---
    dataset = 'iris'  # alternative: 'digits'
    random_state = 42
    train_size = 0.75
    test_size = 1.0 - train_size
    checkpoint_folder = './tpot_checkpoints'
    output_folder = './tpot_output'
    search_iters = 3
    verbosity = 0
    generations = 5
    population_size = 50
    n_jobs = -1

    # --- load and split dataset ---
    loaders = {'iris': load_iris, 'digits': load_digits}
    if dataset not in loaders:
        # Fail early with a clear message instead of an AttributeError on a
        # placeholder value further down.
        raise ValueError(
            f'Unknown dataset {dataset!r}; expected one of {sorted(loaders)}')
    ds = loaders[dataset]()
    X_train, X_test, y_train, y_test = train_test_split(
        ds.data,
        ds.target,
        train_size=train_size,
        test_size=test_size,
        random_state=random_state)

    # --- scoring metric and model evaluation method ---
    scoring = 'accuracy'
    # (human-readable description, splitter object)
    cv = ('stratified k-fold cross-validation',
          StratifiedKFold(n_splits=10,
                          shuffle=True,
                          random_state=random_state))

    # --- define search ---
    tpot = TPOTClassifier(cv=cv[1],
                          scoring=scoring,
                          verbosity=verbosity,
                          random_state=random_state,
                          n_jobs=n_jobs,
                          generations=generations,
                          population_size=population_size,
                          periodic_checkpoint_folder=checkpoint_folder)

    # The exported pipeline files are written into this folder; make sure it
    # exists so the export cannot fail on a missing directory.
    os.makedirs(output_folder, exist_ok=True)

    print(f'Optimizing prediction pipeline for the {dataset} dataset with {cv[0]} using the {scoring} scoring metric')

    # --- pipeline optimization iterations ---
    best_pipes = []
    for i in range(search_iters):
        print(f'\nPipeline optimization iteration: {i}')

        # Time only the search itself, not scoring or export.
        start_time = timeit.default_timer()
        tpot.fit(X_train, y_train)
        elapsed = timeit.default_timer() - start_time

        score = tpot.score(X_test, y_test)
        best_pipes.append(tpot.fitted_pipeline_)
        tpot.export(f'{output_folder}/tpot_{dataset}_pipeline_{i}.py')

        print(f'>>> elapsed time: {elapsed} seconds')
        print(f'>>> pipeline score on test data: {score}')

    # --- check if pipelines are the same ---
    # Pipelines are compared through their string representation.
    first_pipe = str(best_pipes[0])
    all_same = all(str(pipe) == first_pipe for pipe in best_pipes[1:])

    if all_same:
        print("\nAll best pipelines were the same:\n")
        print(best_pipes[0])
    else:
        print('\nBest pipelines:\n')
        print(*best_pipes, sep='\n\n')
# Run the TPOT search only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment