Skip to content

Instantly share code, notes, and snippets.

@xwjiang2010
Last active September 16, 2021 18:30
Show Gist options
  • Save xwjiang2010/c83dfd14292cd8d318fcb641d28610b4 to your computer and use it in GitHub Desktop.
Save xwjiang2010/c83dfd14292cd8d318fcb641d28610b4 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import ray
from ray import tune
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tune_sklearn import TuneSearchCV
from xgboost import XGBClassifier
from ray.tune.suggest.bohb import TuneBOHB
# Load the training table; `id` and `target` are read with compact integer dtypes.
train_df = pd.read_csv(
    '~/Downloads/datasets/train.csv',
    dtype={'id': np.int32, 'target': np.int8},
)

# Labels as a NumPy array; the feature frame is everything except label and row id.
y = train_df['target'].values
X = train_df.drop(['target', 'id'], axis=1)

# Column groups to discard, selected by substring match on the column name
# plus an explicit hand-picked list.
features = X.columns.tolist()
cat_car_05 = [c for c in features if 'car_05_cat' in c]
calc_features = [c for c in features if 'calc' in c]
other_features_to_drop = [
    'ps_ind_14', 'ps_car_10_cat', 'ps_car_14', 'ps_ind_10_bin', 'ps_ind_11_bin',
    'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_car_11', 'ps_car_12',
]
X = X.drop(columns=[*calc_features, *cat_car_05, *other_features_to_drop])

# Replace the -1 placeholder in `ps_reg_03` with the column median.
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on the selection `X['ps_reg_03']` is a chained in-place update, which pandas
# warns about and which does not modify `X` when Copy-on-Write is enabled.
# NOTE(review): the median is computed over the column *including* the -1
# placeholders, exactly as before — confirm that this is intended.
X['ps_reg_03'] = X['ps_reg_03'].replace(-1, X['ps_reg_03'].median())
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)
# learning_rate=0.02, n_estimators=600,
# Base estimator: binary logistic objective, a single thread per model, and
# AUC as XGBoost's evaluation metric. The remaining hyperparameters come from
# the search space below.
model = XGBClassifier(
    objective='binary:logistic',
    nthread=1,
    eval_metric='auc',
)

# Search space sampled by Ray Tune.
new_params = {
    # Boosting schedule: discrete candidates.
    "learning_rate": tune.choice([0.02, 0.1, 0.5]),
    "n_estimators": tune.choice([50, 200, 600]),
    # Tree shape. `tune.randint` excludes its upper bound per the Ray Tune docs.
    "max_depth": tune.randint(lower=1, upper=10),
    "min_child_weight": tune.loguniform(lower=0.001, upper=128),
    # Row / column subsampling fractions.
    "subsample": tune.uniform(lower=0.1, upper=1.0),
    "colsample_bylevel": tune.uniform(lower=0.01, upper=1.0),
    "colsample_bytree": tune.uniform(lower=0.01, upper=1.0),
    # L1 / L2 regularisation strengths, sampled on a log scale.
    "reg_alpha": tune.loguniform(lower=1 / 1024, upper=10.0),
    "reg_lambda": tune.loguniform(lower=1 / 1024, upper=10.0),
    # Weight applied to the positive class.
    "scale_pos_weight": tune.choice([1, 26, 50]),
}
# BOHB searcher, constructed with max_concurrent=40.
bohb_searcher = TuneBOHB(max_concurrent=40)

# Cross-validated hyperparameter search over `new_params`: 3 folds,
# n_trials=100, scored by ROC AUC.
# NOTE(review): early_stopping=True is used with the default `max_iters`;
# confirm against the tune-sklearn docs whether `max_iters` must be raised for
# early stopping to have any effect.
run_cv = TuneSearchCV(
    model,
    param_distributions=new_params,
    cv=3,
    n_trials=100,
    scoring="roc_auc",
    early_stopping=True,
    search_optimization=bohb_searcher,
    verbose=2,
)
# run_cv = GridSearchCV(
# model,
# {
# "learning_rate": [0.5, 1],
# "n_estimators": [50, 100],
# },
# cv=3,
# scoring='roc_auc',
# n_jobs=-1,
# verbose=2
# )
# ray.init("ray://54.218.29.220:10001")
# === remove this ===
# Initialise Ray with address="auto" (per the Ray docs this attaches to an
# already-running cluster rather than starting a new one), then run the search.
ray.init(address="auto")
run_cv.fit(X_train, y_train)
# print(run_cv.cv_results_)

# Score the best estimator found by the search on the held-out split.
trained_model = run_cv.best_estimator_
y_pred = trained_model.predict_proba(X_test)

# Column 1 of the predict_proba output is used as the positive-class score.
# The result gets its own name: the previous code assigned it to
# `roc_auc_score`, which shadowed the imported metric function and would make
# any later call to it fail.
test_auc = roc_auc_score(y_test, y_pred[:, 1])

banner = "=============================================================="
print(banner)
print(test_auc)
print(banner)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment