This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def xray(var): | |
"""Return name of variable and its state, | |
for logging purposes.""" | |
import inspect, re | |
string = inspect.getframeinfo( | |
inspect.getouterframes( | |
inspect.currentframe() | |
)[1][0]).code_context[0] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def regression_roc_auc_score(y_true, y_pred, num_rounds = 10000): | |
""" | |
Computes Regression-ROC-AUC-score. | |
Parameters: | |
---------- | |
y_true: array-like of shape (n_samples,). Binary or continuous target variable. | |
y_pred: array-like of shape (n_samples,). Target scores. | |
num_rounds: int or string. If integer, number of random pairs of observations. | |
If string, 'exact', all possible pairs of observations will be evaluated. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def naive_roc_auc_score(y_true, y_pred): | |
num_same_sign = 0 | |
num_pairs = 0 | |
for a in range(len(y_true)): | |
for b in range(len(y_true)): | |
if y_true[a] > y_true[b]: | |
num_pairs += 1 | |
if y_pred[a] > y_pred[b]: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.feature_selection import f_regression | |
# inputs: | |
# X: pandas.DataFrame, features | |
# y: pandas.Series, target variable | |
# K: number of features to select | |
# compute F-statistics and initialize correlation matrix | |
F = pd.Series(f_regression(X, y)[0], index = X.columns) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.feature_selection import f_regression | |
# inputs: | |
# X: pandas.DataFrame, features | |
# y: pandas.Series, target variable | |
# K: number of features to select | |
# compute F-statistics and correlations | |
F = pd.Series(f_regression(X, y)[0], index = X.columns) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Weight-of-Evidence (WoE) encoding of the categorical Series `x` against the
# binary target Series `y` (values 0 / 1). Both are expected to be defined by
# the caller and to share the same index.
#
#   WoE(level) = ln( (ones in level / all ones) / (zeros in level / all zeros) )

# Per-level counts of positives / negatives, broadcast back onto the rows.
# `Series.map` is used as a plain lookup (rather than `Series.replace`) so the
# result is numeric even when the levels are strings; an object-dtype result
# cannot be passed to `np.log` below.
y_level_ones = x.map((y == 1).groupby(x).sum())
y_level_zeros = x.map((y == 0).groupby(x).sum())

# Overall number of positives / negatives in the target.
y_ones = (y == 1).sum()
y_zeros = (y == 0).sum()

# Share of all positives (resp. negatives) that falls into each row's level.
nominator = y_level_ones / y_ones
denominator = y_level_zeros / y_zeros

# NOTE: no smoothing is applied, so a level without positives encodes to -inf
# and a level without negatives encodes to +inf.
woe_encoder = np.log(nominator / denominator)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Leave-one-out target encoding of the categorical Series `x` against the
# target Series `y`: each row is encoded as the mean of `y` over all OTHER rows
# that share its level. `x` and `y` are expected to be defined by the caller
# and to share the same index; missing values in `y` are assumed absent.
#
# The mean of "everyone in the level except me" is computed in closed form,
#   (level_sum - own_value) / (level_count - 1),
# which avoids building one Python list per row (quadratic in the number of
# rows) and does not depend on `x.name` or on index labels being unique.
by_level = y.groupby(x)
level_sum = by_level.transform('sum')
level_count = by_level.transform('count')

# A level observed only once has no other rows to average: 0 / 0 -> NaN.
leave_one_out_encoding = (level_sum - y) / (level_count - 1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CatBoost-style (ordered) target encoding of the categorical Series `x`
# against the target Series `y`. Each row is encoded from the target values of
# the rows of the same level that come BEFORE it, shrunk towards the global
# mean with weight `a`:
#
#   (sum of earlier y in level + y_mean * a) / (number of earlier rows + a)
#
# `x`, `y` and the smoothing weight `a` are expected to be defined by the
# caller; `x` and `y` must share the same index.
y_mean = y.mean()

# "Before" means earlier in row order. Running per-level totals give the
# statistics of the preceding rows directly, so the result does not depend on
# `x.name`, on index labels being sorted/unique, or on a per-row Python list.
by_level = y.groupby(x)
prior_sum = by_level.cumsum() - y   # running total minus the row's own value
prior_count = by_level.cumcount()   # number of earlier rows in the same level

# The first row of every level has no history and encodes to the global mean
# (for a > 0).
catboost_encoding = (prior_sum + y_mean * a) / (prior_count + a)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# GLMM encoding of the categorical Series `x` against the target Series `y`:
# fit a random-intercept linear mixed model `y ~ 1` grouped by the levels of
# `x`, then encode each row as the fixed intercept plus the estimated random
# effect of its level. `x`, `y` and `smf` (statsmodels.formula.api) are
# expected to be provided by the caller.

# Name the column explicitly so the formula resolves whatever `y.name` is.
model = smf.mixedlm(formula='y ~ 1', data=y.to_frame(name='y'), groups=x).fit()
intercept = model.params['Intercept']

# `random_effects` maps each level to a one-element Series; extract the scalar
# with `.iloc[0]` because calling `float()` directly on a Series is deprecated
# in pandas.
level_effects = {k: float(v.iloc[0]) for k, v in model.random_effects.items()}

# Look the per-level effect up for every row (`map` keeps the result numeric).
random_effect = x.map(level_effects)
glmm_encoding = intercept + random_effect
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# James-Stein encoding of the categorical Series `x` against the target Series
# `y`: each row's level mean is shrunk towards the global mean, with more
# shrinkage the noisier the level is relative to the overall target variance.
# `x` and `y` are expected to be defined by the caller and share an index.
y_mean = y.mean()
y_var = y.var()

# Per-level mean and variance, broadcast back onto the rows. `Series.map` is a
# plain lookup and keeps the result numeric even when the levels are strings.
level_stats = y.groupby(x).agg(['mean', 'var'])
y_level_mean = x.map(level_stats['mean'])
y_level_var = x.map(level_stats['var'])

# Number of distinct levels, computed once. `nunique()` ignores missing values,
# matching the levels `groupby` sees.
n_levels = x.nunique()

# Weight given to the level mean; the remainder goes to the global mean.
# NOTE: a level observed only once has an undefined (NaN) variance and
# therefore a NaN encoding.
weight = 1 - (y_level_var / (y_var + y_level_var) * (n_levels - 3) / (n_levels - 1))
james_stein_encoding = y_level_mean * weight + y_mean * (1 - weight)
NewerOlder