Created
January 9, 2024 13:37
-
-
Save jargnar/681e6815ac4e66ba1fb15047a2ff248a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def train(df: pd.DataFrame, ycol: str, event_id: str) -> tuple[CatBoostClassifier, TrainTestTargetDistribution]:
    """Fit a CatBoost classifier on a quantile-binned version of ``ycol``.

    The target column is first replaced by quantile-bin labels (see
    ``get_quantile_binned_target``), which also adds a ``{ycol}_original``
    column holding the raw target values. That helper writes into the frame
    it is given, so the caller's ``df`` gains those columns too.

    Args:
        df: Training data containing the target column ``ycol``.
        ycol: Name of the target column.
        event_id: Identifier that is only used in the log message.

    Returns:
        A ``(model, distribution)`` tuple: the fitted classifier and a
        ``TrainTestTargetDistribution`` built from the raw (un-binned) target
        values of the train and test partitions plus the first training row
        as an example payload.

    Raises:
        ValueError: If ``df`` is empty (raised by
            ``get_quantile_binned_target``).
    """
    df = get_quantile_binned_target(df, ycol)

    # Oversample by simple duplication: every row whose binned class occurs
    # exactly once is appended a second time. ``pd.concat`` replaces
    # ``DataFrame.append``, which no longer exists in pandas >= 2.0.
    # NOTE(review): the original comment says this is done for a stratified
    # train/test split, but the split below does not pass ``stratify`` --
    # confirm whether stratification is still intended.
    is_singleton_class = df.groupby(ycol)[ycol].transform("count") == 1
    df = pd.concat([df, df[is_singleton_class]])

    X: pd.DataFrame = df.drop(columns=[ycol])
    y: pd.Series = df[ycol]

    # Boosters like catboost and xgboost need categorical variables to be
    # encoded. CatBoost can do this automatically, but it has to be told
    # which columns are categorical: here, every non-numeric column.
    catcols = X.select_dtypes(exclude=[np.number]).columns.to_list()
    X[catcols] = X[catcols].astype("category")
    _LOGGER.info(f"T:PMODEL S:4/7 event_id={event_id} catcols={catcols}")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Set the raw target values aside for the distribution report, then drop
    # them from the feature matrices so the model never sees them.
    original_ycol = f"{ycol}_original"
    original_y_train = X_train[original_ycol]
    original_y_test = X_test[original_ycol]
    X_train = X_train.drop(columns=[original_ycol])
    X_test = X_test.drop(columns=[original_ycol])

    train_test_target_distribution = TrainTestTargetDistribution(
        train_targets=original_y_train.to_list(),
        test_targets=original_y_test.to_list(),
        example_payload=X_train.iloc[0].to_dict(),
    )

    # The scratch directory is removed when the ``with`` block exits, even if
    # constructing or fitting the model raises.
    with tempfile.TemporaryDirectory() as train_dir:
        model = CatBoostClassifier(train_dir=train_dir, silent=True, cat_features=catcols)
        model.fit(X_train, y_train, eval_set=(X_test, y_test))

    return (model, train_test_target_distribution)
def get_quantile_binned_target(df: pd.DataFrame, ycol: str, max_quantiles: int = 7) -> pd.DataFrame:
    """Replace the target column with quantile-bin labels.

    The raw target values are preserved in a new ``{ycol}_original`` column
    and ``ycol`` itself is overwritten with labels of the form
    ``"<left> to <right>"``, one per quantile interval. ``df`` is modified in
    place and the same object is returned.

    Args:
        df: Frame containing the target column.
        ycol: Name of the target column to bin.
        max_quantiles: Upper bound on the number of quantile bins. The number
            actually requested is the smaller of this and the number of
            distinct target values; duplicate bin edges are dropped, so fewer
            bins may result. Defaults to 7.

    Returns:
        ``df`` with ``ycol`` binned and ``{ycol}_original`` added.

    Raises:
        ValueError: If ``df`` is empty or ``max_quantiles`` is less than 1.

    Here's an example of what this function does:

    input:
    ---
       6502deb5b6b6605c4edbe841  6502deb5b6b6605c4edbe85c          y
    0                lzoqswqcur                  88.55250  -67.61000
    1                znbqnylced                 -28.91700   94.88640
    2                itkpskpjim                 -90.60000   12.00000
    3                yvbpjkwlxh                 -35.50090   15.78182
    4                qvaqoirvfx                  99.80000   -9.77100
    5                zhigqexmsv                  95.77939   25.39412
    6                nltepbcbsu                  69.46500  -81.42056
    7                iapvffgfkx                  -1.08777   38.00000
    8                shshwbovyl                 -69.51820   34.70000
    9                jugddlnxcl                  -6.47582   36.08890
    10               tlvpakccuz                 -49.00000  -84.00000
    11               tdmarbqybn                 -22.21000   64.81100
    12               kwfyxletbh                  -3.60846   31.72300

    output:
    ---
       6502deb5b6b6605c4edbe841  6502deb5b6b6605c4edbe85c                   y
    0                lzoqswqcur                  88.55250   -71.556 to -0.441
    1                znbqnylced                 -28.91700     45.66 to 94.886
    2                itkpskpjim                 -90.60000    -0.441 to 17.155
    3                yvbpjkwlxh                 -35.50090    -0.441 to 17.155
    4                qvaqoirvfx                  99.80000   -71.556 to -0.441
    5                zhigqexmsv                  95.77939    17.155 to 30.819
    6                nltepbcbsu                  69.46500  -84.001 to -71.556
    7                iapvffgfkx                  -1.08777     35.494 to 45.66
    8                shshwbovyl                 -69.51820    30.819 to 35.494
    9                jugddlnxcl                  -6.47582     35.494 to 45.66
    10               tlvpakccuz                 -49.00000  -84.001 to -71.556
    11               tdmarbqybn                 -22.21000     45.66 to 94.886
    12               kwfyxletbh                  -3.60846    30.819 to 35.494
    """
    if df.empty:
        raise ValueError("Empty dataframe")
    if max_quantiles < 1:
        raise ValueError("max_quantiles must be at least 1")

    # Why quantiles at all: the target is mapped to a categorical variable
    # and the problem is treated as classification, which currently gives
    # better accuracy. Picking the bin count automatically from the
    # distribution is non-trivial (some distributions are more skewed than
    # others), so a fixed cap is used -- this is MVP territory.
    # Never ask for more bins than there are distinct target values.
    num_quantiles = min(max_quantiles, df[ycol].nunique())
    quantiles = pd.qcut(df[ycol], q=num_quantiles, duplicates="drop")

    # Keep the raw values before overwriting the target with bin labels.
    df[f"{ycol}_original"] = df[ycol]
    df[ycol] = quantiles.apply(lambda x: f"{x.left} to {x.right}")
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment