Skip to content

Instantly share code, notes, and snippets.

@yanboyang713
Created June 12, 2024 11:27
Show Gist options
  • Save yanboyang713/cfce9908ea7acce6566598817dc56308 to your computer and use it in GitHub Desktop.
def objective(trial):
    """Optuna objective: train a multi-output SVR and return the summed MAE.

    Suggests C / epsilon / kernel for this trial, fits a
    MultiOutputRegressor(SVR) on the module-level ``data`` DataFrame,
    logs the model and metrics to MLflow inside a nested run, and returns
    ``mae_HTTP_reply_code + mae_Bytes`` for the study to minimize.
    """
    with mlflow.start_run(nested=True):
        print("start trial")
        # Suggest hyperparameters for this trial.
        C = trial.suggest_float('C', 0.1, 10.0)
        epsilon = trial.suggest_float('epsilon', 0.01, 1.0)
        kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
        # Split the data into features and target.
        # NOTE(review): X and y select the SAME columns, so the model is
        # trained to predict its own inputs — confirm the intended target
        # columns; this looks like a copy/paste slip.
        X = data[['HTTP_reply_code', 'Bytes']]
        y = data[['HTTP_reply_code', 'Bytes']]
        # Hold out the last n_test_obs rows as a time-ordered test split.
        n_test_obs = 20
        X_train, X_test = X[:-n_test_obs], X[-n_test_obs:]
        y_train, y_test = y[:-n_test_obs], y[-n_test_obs:]
        print("done Split the data into training and testing sets")
        # Train the multi-output SVM model with the suggested hyperparameters.
        svr = SVR(C=C, epsilon=epsilon, kernel=kernel)
        model = MultiOutputRegressor(svr)
        fitted_model = model.fit(X_train, y_train)
        # Log the in-memory size of the fitted model as a metric.
        fitted_model_size = asizeof.asizeof(fitted_model)
        print("done Train the Multioutput SVM model with suggested hyperparameters")
        mlflow.log_metric("fitted_model_size", fitted_model_size)
        mlflow.sklearn.log_model(model, "multioutput_svm_model")
        # Make predictions on the held-out rows.
        y_pred = model.predict(X_test)
        # Per-target Mean Absolute Error; column 0 is HTTP_reply_code, 1 is Bytes.
        mae_HTTP_reply_code = mean_absolute_error(y_test['HTTP_reply_code'], y_pred[:, 0])
        mae_Bytes = mean_absolute_error(y_test['Bytes'], y_pred[:, 1])
        mlflow.log_metric("mae_http_reply_code", mae_HTTP_reply_code)
        mlflow.log_metric("mae_bytes", mae_Bytes)
        # Combined objective value (sum of the two MAEs) to minimize.
        mae_sum = mae_HTTP_reply_code + mae_Bytes
        return mae_sum
# Module-level holder for the winning hyperparameters of the last study.
# NOTE(review): run_Multioutput_Regression_SVM_Hyperparameters assigns
# `best_param` without a `global` statement, so as written this value is
# never actually updated — confirm whether it is still needed.
best_param = ""
def champion_callback(study, frozen_trial):
    """
    Logging callback that will report when a new trial iteration improves upon existing
    best trial values.

    Note: This callback is not intended for use in distributed computing systems such as Spark
    or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
    workers or agents.
    The race conditions with file system state management for distributed trials will render
    inconsistent values with this callback.
    """
    winner = study.user_attrs.get("winner", None)
    # A falsy best_value (e.g. exactly 0.0) skips the update entirely —
    # kept as-is to preserve the original behavior.
    if study.best_value and winner != study.best_value:
        study.set_user_attr("winner", study.best_value)
        if winner:
            # Improvement expressed relative to the NEW best value.
            improvement_percent = (abs(winner - study.best_value) / study.best_value) * 100
            print(
                f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
                f"{improvement_percent: .4f}% improvement"
            )
        else:
            print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")
def run_Multioutput_Regression_SVM_Hyperparameters(data, model_family, dataset_name, experiment_id=None):
    """Run an Optuna study tuning the multi-output SVR and log it to MLflow.

    Args:
        data: pandas DataFrame, logged as the MLflow input dataset.
        model_family: model family label used in the run name and tags.
        dataset_name: dataset label used in the run name and tags.
        experiment_id: MLflow experiment id. Defaults to None (the active
            experiment) so callers that omit it still work.

    Returns:
        The completed optuna Study object.
    """
    # Publish the winning hyperparameters at module level (the original
    # assignment was a dead local without this declaration).
    global best_param
    mlflow.end_run()
    # Explicitly name runs.
    today = dt.datetime.now()
    run_name = model_family + " " + str(today) + " " + dataset_name
    # Create an instance of a PandasDataset for MLflow input logging.
    dataset = mlflow.data.from_pandas(
        data, name=dataset_name
    )
    mlflow.enable_system_metrics_logging()
    mlflow.system_metrics.set_system_metrics_sampling_interval(1)
    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id,
                          log_system_metrics=True, nested=True) as run:
        # Turn autolog on to save model artifacts, requirements, etc.
        mlflow.autolog(log_models=True)
        # Run the Optuna optimization; TimeoutError in a trial is caught so
        # the study continues with the remaining trials.
        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=20, callbacks=[champion_callback],
                       timeout=120, catch=(TimeoutError,))
        # Best hyperparameters and best objective value.
        mlflow.log_params(study.best_params)
        # NOTE(review): the objective is a summed MAE, so "best_mse" is a
        # misnomer — name kept unchanged so existing dashboards keep working.
        mlflow.log_metric("best_mse", study.best_value)
        mlflow.log_input(dataset, context="training")
        # Log tags.
        mlflow.set_tags(
            tags={
                "project": "p4 workflow",
                "optimizer_engine": "optuna",
                "model_family": model_family,
                "dataset": dataset_name,
                "with_the_Best_Hyperparameters": "false",
            }
        )
        print("run id: ", run.info.run_id)
        print("best_params: ", study.best_params)
        best_param = study.best_params
    return study
# Script entry: tune the multi-output SVR on the calgary_one_day dataset.
model_family = "Multioutput_Regression_SVM"
dataset_name = "calgary_one_day"
# NOTE(review): the function signature requires `experiment_id` but it is not
# passed here, so as written this call raises TypeError. Confirm the intended
# experiment id (e.g. from mlflow.set_experiment) and pass it through.
study = run_Multioutput_Regression_SVM_Hyperparameters(data, model_family, dataset_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment