Katsumata420/optuna_sklearn_mllfow.py

## optuna_sklearn_mllfow.py
"""
Optuna example that optimizes a classifier configuration for the Iris dataset using sklearn.

In this example, we optimize a classifier configuration for the Iris dataset. The classifiers come
from scikit-learn. We optimize both the choice of classifier (SVC or RandomForest) and its
hyperparameters, and the cross-validation metrics of every trial are logged to MLflow.

"""
import mlflow

import optuna
from optuna.integration.mlflow import MLflowCallback, RUN_ID_ATTRIBUTE_KEY

import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm


# Set Optuna's log verbosity to WARNING (presumably to keep per-trial INFO messages off the console).
optuna.logging.set_verbosity(optuna.logging.WARNING)

# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
class OptunaObjective:
    """Callable Optuna objective that cross-validates a classifier on Iris.

    Each call creates an MLflow run named after the trial number, samples a
    classifier and its hyperparameter from the trial, evaluates it with 3-fold
    cross-validation, and logs the averaged cross-validation results to that run.

    Args:
        experiment_id: ID of the MLflow experiment in which runs are created.
    """

    def __init__(self, experiment_id):
        self.experiment_id = experiment_id

    @staticmethod
    def _suggest_classifier(trial):
        """Sample the classifier type and its hyperparameter, and build the estimator."""
        choice = trial.suggest_categorical("classifier", ["SVC", "RandomForest"])
        if choice == "SVC":
            regularization = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
            return sklearn.svm.SVC(C=regularization, gamma="auto")

        depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        return sklearn.ensemble.RandomForestClassifier(max_depth=depth, n_estimators=10)

    def __call__(self, trial):
        """Evaluate one trial and return its mean cross-validated accuracy.

        Args:
            trial: The Optuna trial to sample hyperparameters from.

        Returns:
            The mean of the "test_accuracy" scores over the 3 folds.
        """
        # Create the run up front and remember its ID on the trial under
        # RUN_ID_ATTRIBUTE_KEY (presumably so MLflowCallback can log into the
        # same run later -- confirm against the Optuna integration).
        with mlflow.start_run(
            experiment_id=self.experiment_id, run_name=str(trial.number)
        ) as created_run:
            mlflow_run_id = created_run.info.run_id
            trial.set_system_attr(RUN_ID_ATTRIBUTE_KEY, mlflow_run_id)

        dataset = sklearn.datasets.load_iris()
        features = dataset.data
        labels = dataset.target

        estimator = self._suggest_classifier(trial)

        # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
        scoring = {"f": "f1_macro", "accuracy": "accuracy"}
        cv_results = sklearn.model_selection.cross_validate(
            estimator, features, labels, n_jobs=-1, cv=3, scoring=scoring
        )

        # Average every per-fold array returned by cross_validate.
        mean_scores = {}
        for key, per_fold in cv_results.items():
            mean_scores[key] = per_fold.mean()
        mean_accuracy = mean_scores["test_accuracy"]

        # Re-open the run created above and attach the averaged results to it.
        with mlflow.start_run(experiment_id=self.experiment_id, run_id=mlflow_run_id):
            mlflow.log_metrics(mean_scores)
        return mean_accuracy


if __name__ == "__main__":
    # Name under which the experiment is stored in MLflow.
    experiment_name = "optuna-experiment"
    # mlflow.create_experiment raises if an experiment with this name already
    # exists, so reuse the existing one when the script is run more than once.
    existing_experiment = mlflow.get_experiment_by_name(experiment_name)
    if existing_experiment is None:
        experiment_id = mlflow.create_experiment(name=experiment_name)
    else:
        experiment_id = existing_experiment.experiment_id

    objective = OptunaObjective(experiment_id)
    mlflow_kwargs = {"experiment_id": experiment_id}
    # metric_name is the metric name stored in MLflow.
    # NOTE: the MLflowCallback usage below requires Optuna v3.0.0 or later.
    mlflow_call = MLflowCallback(
        metric_name="hoge", create_experiment=False, mlflow_kwargs=mlflow_kwargs
    )
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, callbacks=[mlflow_call])
    print(study.best_trial)
	"""
	Optuna example that optimizes a classifier configuration for the Iris dataset using sklearn.

	In this example, we optimize a classifier configuration for the Iris dataset. The classifiers come
	from scikit-learn. We optimize both the choice of classifier (SVC or RandomForest) and its
	hyperparameters, and the cross-validation metrics of every trial are logged to MLflow.

	"""
	import mlflow

	import optuna
	from optuna.integration.mlflow import MLflowCallback, RUN_ID_ATTRIBUTE_KEY

	import sklearn.datasets
	import sklearn.ensemble
	import sklearn.model_selection
	import sklearn.svm


	# Set Optuna's log verbosity to WARNING (presumably to keep per-trial INFO messages off the console).
	optuna.logging.set_verbosity(optuna.logging.WARNING)

	# FYI: Objective functions can take additional arguments
	# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
	class OptunaObjective:
		"""Callable Optuna objective that cross-validates a classifier on Iris.

		Each call creates an MLflow run named after the trial number, samples a
		classifier and its hyperparameter from the trial, evaluates it with 3-fold
		cross-validation, and logs the averaged cross-validation results to that run.

		Args:
			experiment_id: ID of the MLflow experiment in which runs are created.
		"""

		def __init__(self, experiment_id):
			self.experiment_id = experiment_id

		def __call__(self, trial):
			"""Evaluate one trial and return its mean cross-validated accuracy.

			Args:
				trial: The Optuna trial to sample hyperparameters from.

			Returns:
				The mean of the "test_accuracy" scores over the 3 folds.
			"""
			# Create the run first and store its ID on the trial under
			# RUN_ID_ATTRIBUTE_KEY (presumably so MLflowCallback can reuse the
			# same run -- confirm against the Optuna integration).
			with mlflow.start_run(experiment_id=self.experiment_id, run_name=str(trial.number)) as active_run:
				run_id = active_run.info.run_id
				trial.set_system_attr(RUN_ID_ATTRIBUTE_KEY, run_id)

			iris = sklearn.datasets.load_iris()
			x, y = iris.data, iris.target

			classifier_name = trial.suggest_categorical("classifier", ["SVC", "RandomForest"])
			if classifier_name == "SVC":
				svc_c = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
				classifier_obj = sklearn.svm.SVC(C=svc_c, gamma="auto")
			else:
				rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
				classifier_obj = sklearn.ensemble.RandomForestClassifier(
					max_depth=rf_max_depth, n_estimators=10
				)

			# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
			metric = {"f": "f1_macro", "accuracy": "accuracy"}
			score = sklearn.model_selection.cross_validate(classifier_obj, x, y, n_jobs=-1, cv=3, scoring=metric)
			# Average every per-fold array returned by cross_validate.
			score = {k: v.mean() for k, v in score.items()}
			accuracy = score["test_accuracy"]

			# Re-open the run created above and attach the averaged results to it.
			with mlflow.start_run(experiment_id=self.experiment_id, run_id=run_id):
				mlflow.log_metrics(score)
			return accuracy


	if __name__ == "__main__":
		# Name under which the experiment is stored in MLflow.
		experiment_name = "optuna-experiment"
		# mlflow.create_experiment raises if an experiment with this name already
		# exists, so reuse the existing one when the script is run more than once.
		existing_experiment = mlflow.get_experiment_by_name(experiment_name)
		if existing_experiment is None:
			experiment_id = mlflow.create_experiment(name=experiment_name)
		else:
			experiment_id = existing_experiment.experiment_id

		objective = OptunaObjective(experiment_id)
		mlflow_kwargs = {"experiment_id": experiment_id}
		# metric_name is the metric name stored in MLflow.
		# NOTE: the MLflowCallback usage below requires Optuna v3.0.0 or later.
		mlflow_call = MLflowCallback(metric_name="hoge", create_experiment=False, mlflow_kwargs=mlflow_kwargs)
		study = optuna.create_study(direction="maximize")
		study.optimize(objective, n_trials=100, callbacks=[mlflow_call])
		print(study.best_trial)