Skip to content

Instantly share code, notes, and snippets.

@shamilnabiyev
Created October 31, 2022 18:52
Show Gist options
  • Save shamilnabiyev/c1b6e23444a949bc45243eab445a3806 to your computer and use it in GitHub Desktop.
Save shamilnabiyev/c1b6e23444a949bc45243eab445a3806 to your computer and use it in GitHub Desktop.
Hyperparameter optimization for Random Forest Classifier using the Optuna lib
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

# Only show Optuna warnings and errors; suppresses the per-trial INFO output.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Single seed shared by data generation, the train/test split and every
# RandomForestClassifier built further down, so a run is reproducible.
# The code below references SEED, so it has to be defined before it is used.
SEED = 42

# Synthetic classification data set: 250 samples with 10 features, of which
# 5 are informative and 3 are redundant.
X, y = make_classification(
    n_samples=250,
    n_features=10,
    n_informative=5,
    n_redundant=3,
    random_state=SEED,
    shuffle=True,
)

# Hold out 20% of the samples for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=SEED,
)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

def objective(trial):
    """Return the mean 5-fold cross-validated accuracy for one Optuna trial.

    Samples a set of random forest hyperparameters from ``trial``, builds a
    ``RandomForestClassifier`` with them and scores it with cross-validation
    on the training split. Reads the module-level names ``X_train``,
    ``y_train`` and ``SEED``.

    Args:
        trial: The Optuna trial object used to sample the hyperparameters.

    Returns:
        The mean of the per-fold scores (higher is better).
    """
    params = {
        # Number of trees in the random forest: 100, 200, ..., 500.
        "n_estimators": trial.suggest_int(
            name="n_estimators", low=100, high=500, step=100
        ),
        # Number of features to consider at every split. 'auto' is not
        # offered: for classifiers it was an alias of 'sqrt', and
        # scikit-learn 1.3 removed it, so passing it raises an error there.
        "max_features": trial.suggest_categorical(
            name="max_features", choices=["sqrt", "log2"]
        ),
        # Maximum number of levels in a tree: 10, 30, ..., 110.
        "max_depth": trial.suggest_int(
            name="max_depth", low=10, high=110, step=20
        ),
        # Minimum number of samples required to split a node: 2, 4, ..., 10.
        "min_samples_split": trial.suggest_int(
            name="min_samples_split", low=2, high=10, step=2
        ),
        # Minimum number of samples required at each leaf node: 1..4.
        "min_samples_leaf": trial.suggest_int(
            name="min_samples_leaf", low=1, high=4, step=1
        ),
    }
    model = RandomForestClassifier(random_state=SEED, **params)

    # With no explicit ``scoring`` the estimator's own score (accuracy for a
    # classifier) is used; the 5 folds are evaluated in up to 4 parallel jobs.
    fold_scores = cross_val_score(model, X_train, y_train, n_jobs=4, cv=5)
    return fold_scores.mean()

# The objective returns an accuracy, so the study has to maximize it. Optuna's
# default direction is "minimize", which would report the worst trial as best.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

# Refit a fresh forest on the whole training split using the best
# hyperparameters found by the study.
best_model = RandomForestClassifier(**study.best_params, random_state=SEED)
best_model.fit(X_train, y_train)

# Score the refitted model on the held-out test split.
y_pred = best_model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
binary_scores = precision_recall_fscore_support(y_test, y_pred, average='binary')
test_precision, test_recall, test_f1, _ = binary_scores

# Report every metric as "<label> <value>", one per line.
for label, value in (
    ("test_accuracy:", test_acc),
    ("test_precision:", test_precision),
    ("test_recall:", test_recall),
    ("test_f1_score:", test_f1),
):
    print(label, value)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment