import pandas as pd
import lightgbm as lgb
from lightgbm import cv
from scipy import stats


class AdversarialValidation:
    """
    Easy-to-use and fast adversarial validation using LightGBM.
    """

    DEFAULT_PARAMS = {'learning_rate': 0.01,
                      'objective': 'binary',
                      'metric': 'auc',
                      'boosting': 'gbdt',
                      'verbosity': 0,
                      'n_jobs': -1,
                      'force_col_wise': True}

    def __init__(self) -> None:
        pass
    def validate(self, X_train, X_test, params=DEFAULT_PARAMS):
        """Performs adversarial validation and prints the result.

        Args:
            X_train (DataFrame): Training dataset
            X_test (DataFrame): Test dataset
            params (dict, optional): Parameters for the LightGBM cross-validation run.
                Defaults to DEFAULT_PARAMS.
        """
        # Work on copies so the caller's DataFrames are not modified,
        # then label each row with its origin: 0 = train, 1 = test.
        X_train = X_train.copy()
        X_test = X_test.copy()
        X_train["Target"] = 0
        X_test["Target"] = 1

        all_data = pd.concat([X_train, X_test], axis=0, ignore_index=True)
        all_data = all_data.dropna()

        features = list(all_data.columns[~all_data.columns.isin(['Target'])])
        X = all_data.loc[:, features]
        y = all_data["Target"]

        # Feature names are attached to the Dataset, so they need not be passed to cv() again.
        LGBdata = lgb.Dataset(X, y, feature_name=features)

        # Cross-validate a classifier that tries to distinguish train rows from
        # test rows. An AUC close to 0.5 means the two sets look alike; an AUC
        # close to 1.0 signals a distribution shift between train and test.
        # (On newer LightGBM releases, early stopping is passed as a callback,
        # e.g. callbacks=[lgb.early_stopping(20)], instead of early_stopping_rounds.)
        cross_val_results = cv(train_set=LGBdata, params=params,
                               nfold=5, metrics="auc",
                               num_boost_round=100,
                               early_stopping_rounds=20)

        # print out the final result (scores of the last boosting round)
        print("AUC Mean", cross_val_results["auc-mean"][-1])
        print("AUC Std", cross_val_results["auc-stdv"][-1])
    def get_stats(self, X_train, X_test):
        """Prints the two-sample Kolmogorov-Smirnov test score for each feature.

        Args:
            X_train (DataFrame): Training dataset
            X_test (DataFrame): Test dataset
        """
        features_list = X_train.columns.values.tolist()
        for feature in features_list:
            # Two-sample KS test: compares the feature's distribution in train vs. test.
            statistic, p_value = stats.kstest(X_train[feature], X_test[feature])
            print("KS test value: %.3f" % statistic,
                  "with p-value %.2f" % p_value, "for", feature)
    def list_significant_features(self, X_train, X_test, stat_base=0.3, pval_base=0.05):
        """Prints Kolmogorov-Smirnov test scores only for features with a significant shift.

        Args:
            X_train (DataFrame): Training dataset
            X_test (DataFrame): Test dataset
            stat_base (float, optional): Baseline for the KS test statistic. Defaults to 0.3.
            pval_base (float, optional): Baseline for the p-value. Defaults to 0.05.
        """
        features_list = X_train.columns.values.tolist()
        for feature in features_list:
            statistic, p_value = stats.kstest(X_train[feature], X_test[feature])
            # Flag features whose train/test distributions differ significantly.
            if statistic > stat_base and p_value < pval_base:
                print("KS test value: %.3f" % statistic,
                      "with p-value %.2f" % p_value, "for", feature)