Full sample code for MLflow example: a small random search over RandomForestClassifier hyperparameters on the iris dataset, logging parameters, cross-validation metrics, and the fitted model to MLflow on each run.
import os

import numpy as np
from scipy.stats import uniform
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_validate, ParameterSampler

import mlflow
import mlflow.sklearn

####################### Amend configurations here ########################
# Credentials
GOOGLE_APPLICATION_CREDENTIALS = <GOOGLE_APPLICATION_CREDENTIALS>  # path to service account json file
MLFLOW_TRACKING_USERNAME = <MLFLOW_TRACKING_USERNAME>  # username
MLFLOW_TRACKING_PASSWORD = <MLFLOW_TRACKING_PASSWORD>  # password

experiment_name = "Experiment 1"  # amend experiment name accordingly
tracking_uri = './mlruns'  # or external IP, e.g., "http://35.225.50.9:80"

# Hyperparameter distributions for our model
hyperparams = {'max_depth': range(5, 21),
               'max_samples': uniform(loc=0.5, scale=0.5),
               'max_features': [None, 'sqrt', 'log2']}

# Other fixed parameters
params = {'cv_folds': 3,
          'n_iter': 6}

# Metrics to score and log (named so as not to shadow the sklearn.metrics module)
scoring_metrics = ['accuracy', 'f1_macro']  # run sorted(metrics.SCORERS.keys()) to see the list of available metrics
########################################################################
# Set environment variables (used for authentication when logging to a remote tracking server)
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = GOOGLE_APPLICATION_CREDENTIALS
os.environ['MLFLOW_TRACKING_USERNAME'] = MLFLOW_TRACKING_USERNAME
os.environ['MLFLOW_TRACKING_PASSWORD'] = MLFLOW_TRACKING_PASSWORD

def data_processing():
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = data_processing()
# Point MLflow at the tracking store before creating/selecting the experiment
mlflow.set_tracking_uri(tracking_uri)

# Set experiment name
mlflow.set_experiment(experiment_name)
experiment = mlflow.get_experiment_by_name(experiment_name)
# Sample n_iter hyperparameter combinations from the distributions above
param_list = list(ParameterSampler(hyperparams, n_iter=params['n_iter'], random_state=0))

for run in range(params['n_iter']):
    run_hyperparams = param_list[run]
    with mlflow.start_run(experiment_id=experiment.experiment_id):
        clf = RandomForestClassifier(max_depth=run_hyperparams['max_depth'],
                                     max_samples=run_hyperparams['max_samples'],
                                     max_features=run_hyperparams['max_features'],
                                     random_state=0)
        clf.fit(X_train, y_train)
        scores = cross_validate(clf, X_train, y_train,
                                cv=params['cv_folds'],
                                scoring=scoring_metrics)
        metrics_dict = {metric: np.mean(scores['test_' + metric])
                        for metric in scoring_metrics}

        # Log model hyperparameters
        mlflow.log_params(run_hyperparams)

        # Log mean cross-validation metrics
        mlflow.log_metrics(metrics_dict)

        # Log the fitted model
        mlflow.sklearn.log_model(clf, "model")
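
# As a quick follow-up, a minimal sketch (not part of the original gist) for
# querying the logged runs and reloading the best model. It assumes the
# `experiment` object, `X_test`/`y_test`, and the 'accuracy' metric from the
# script above, and that the runs were logged to the same tracking URI.
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id],
                          order_by=["metrics.accuracy DESC"])
best_run_id = runs.loc[0, 'run_id']
best_model = mlflow.sklearn.load_model(f"runs:/{best_run_id}/model")
print(best_model.score(X_test, y_test))  # held-out accuracy of the best run's model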