import pandas as pd
import lightgbm as lgb
from scipy import stats
class AdversarialValidation:
    """
    Easy to use and fast adversarial validation using LightGBM.
    """

    DEFAULT_PARAMS = {'learning_rate': 0.01,
                      'objective': 'binary',
                      'metric': 'auc',
                      'boosting': 'gbdt',
                      'verbosity': 0,
                      'n_jobs': -1,
                      'force_col_wise': True}

    def __init__(self) -> None:
        pass
    def validate(self, X_train, X_test, params=DEFAULT_PARAMS):
        """Performs adversarial validation and prints the result.

        An AUC close to 0.5 means the classifier cannot tell training rows from
        test rows (similar distributions); an AUC close to 1.0 signals a
        distribution shift between the two datasets.

        Args:
            X_train (DataFrame): Training dataset
            X_test (DataFrame): Test dataset
            params (Dictionary, optional): Parameters for the LightGBM cross validation. Defaults to DEFAULT_PARAMS.
        """
        # Work on copies so the caller's DataFrames are not modified in place.
        X_train = X_train.copy()
        X_test = X_test.copy()

        # Label the origin of each row: 0 = train, 1 = test.
        X_train["Target"] = 0
        X_test["Target"] = 1

        all_data = pd.concat([X_train, X_test], axis=0, ignore_index=True)
        all_data = all_data.dropna()

        features = list(all_data.columns[~all_data.columns.isin(['Target'])])
        X = all_data.loc[:, features]
        y = all_data["Target"]

        lgb_data = lgb.Dataset(X, y, feature_name=features)

        # Perform cross validation with LightGBM.
        # Note: LightGBM >= 4.0 removed early_stopping_rounds from cv();
        # pass callbacks=[lgb.early_stopping(20)] there instead.
        cross_val_results = lgb.cv(train_set=lgb_data, params=params,
                                   nfold=5, metrics="auc",
                                   num_boost_round=100,
                                   early_stopping_rounds=20)

        # Print out the final result.
        print("AUC Mean", cross_val_results["auc-mean"][-1])
        print("AUC Std", cross_val_results["auc-stdv"][-1])
    def get_stats(self, X_train, X_test):
        """Prints the Kolmogorov-Smirnov test score for each feature.

        Args:
            X_train (DataFrame): Training dataset
            X_test (DataFrame): Test dataset
        """
        features_list = X_train.columns.values.tolist()
        for feature in features_list:
            # Two-sample KS test between the train and test distributions of this feature.
            statistic, p_value = stats.kstest(X_train[feature], X_test[feature])
            print("KS test value: %.3f with p-value %.2f for %s" % (statistic, p_value, feature))
    def list_significant_features(self, X_train, X_test, stat_base=0.3, pval_base=0.05):
        """Prints Kolmogorov-Smirnov test scores only for features whose
        train and test distributions differ significantly.

        Args:
            X_train (DataFrame): Training dataset
            X_test (DataFrame): Test dataset
            stat_base (float, optional): Baseline for the Kolmogorov-Smirnov test statistic. Defaults to 0.3.
            pval_base (float, optional): p-value baseline. Defaults to 0.05.
        """
        features_list = X_train.columns.values.tolist()
        for feature in features_list:
            statistic, p_value = stats.kstest(X_train[feature], X_test[feature])
            # A large statistic with a small p-value flags a feature that drifts between train and test.
            if statistic > stat_base and p_value < pval_base:
                print("KS test value: %.3f with a p-value %.2f for %s" % (statistic, p_value, feature))