import logging
import shutil
import tempfile

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# TrainTestTargetDistribution is a project-specific container (defined elsewhere in
# the codebase) holding the train/test target lists and an example feature payload.

_LOGGER = logging.getLogger(__name__)


def train(df: pd.DataFrame, ycol: str, event_id: str) -> tuple[CatBoostClassifier, TrainTestTargetDistribution]:
    df = get_quantile_binned_target(df, ycol)
    # oversample classes that have only a single member by simple duplication,
    # so that a stratified train/test split strategy remains possible
    df = pd.concat([df, df[df.groupby(ycol)[ycol].transform("count") == 1]])
    X: pd.DataFrame = df.drop(columns=[ycol])
    y: pd.Series = df[ycol]
    # boosters like CatBoost and XGBoost need categorical variables to be encoded;
    # CatBoost can do this automatically, but we have to tell it which columns are categorical
    catcols = X.select_dtypes(exclude=[np.number]).columns.to_list()
    X[catcols] = X[catcols].astype("category")
    _LOGGER.info(f"T:PMODEL S:4/7 event_id={event_id} catcols={catcols}")
    # X_train, X_test, y_train, y_test = train_test_splitter(X, y, test_size=0.3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # keep the original (un-binned) target values for the train and test splits
    original_ycol = f"{ycol}_original"
    original_y_train = X_train[original_ycol]
    original_y_test = X_test[original_ycol]
    # drop the original target column from X_train and X_test so it is not used as a feature
    X_train = X_train.drop(columns=[original_ycol])
    X_test = X_test.drop(columns=[original_ycol])
    train_test_target_distribution = TrainTestTargetDistribution(
        train_targets=original_y_train.to_list(),
        test_targets=original_y_test.to_list(),
        example_payload=X_train.iloc[0].to_dict(),
    )
    train_dir = tempfile.mkdtemp()
    model = CatBoostClassifier(train_dir=train_dir, silent=True, cat_features=catcols)
    model.fit(X_train, y_train, eval_set=(X_test, y_test))
    shutil.rmtree(train_dir)
    return (model, train_test_target_distribution)

def get_quantile_binned_target(df: pd.DataFrame, ycol: str) -> pd.DataFrame:
"""
Returns a dataframe with the target column quantile binned.
Here's an example of what this function does:
input:
---
6502deb5b6b6605c4edbe841 6502deb5b6b6605c4edbe85c y
0 lzoqswqcur 88.55250 -67.61000
1 znbqnylced -28.91700 94.88640
2 itkpskpjim -90.60000 12.00000
3 yvbpjkwlxh -35.50090 15.78182
4 qvaqoirvfx 99.80000 -9.77100
5 zhigqexmsv 95.77939 25.39412
6 nltepbcbsu 69.46500 -81.42056
7 iapvffgfkx -1.08777 38.00000
8 shshwbovyl -69.51820 34.70000
9 jugddlnxcl -6.47582 36.08890
10 tlvpakccuz -49.00000 -84.00000
11 tdmarbqybn -22.21000 64.81100
12 kwfyxletbh -3.60846 31.72300
output:
---
6502deb5b6b6605c4edbe841 6502deb5b6b6605c4edbe85c y
0 lzoqswqcur 88.55250 -71.556 to -0.441
1 znbqnylced -28.91700 45.66 to 94.886
2 itkpskpjim -90.60000 -0.441 to 17.155
3 yvbpjkwlxh -35.50090 -0.441 to 17.155
4 qvaqoirvfx 99.80000 -71.556 to -0.441
5 zhigqexmsv 95.77939 17.155 to 30.819
6 nltepbcbsu 69.46500 -84.001 to -71.556
7 iapvffgfkx -1.08777 35.494 to 45.66
8 shshwbovyl -69.51820 30.819 to 35.494
9 jugddlnxcl -6.47582 35.494 to 45.66
10 tlvpakccuz -49.00000 -84.001 to -71.556
11 tdmarbqybn -22.21000 45.66 to 94.886
12 kwfyxletbh -3.60846 30.819 to 35.494
"""
    if df.empty:
        raise ValueError("Empty dataframe")
    # pick the magic quantile number, q=7
    # it's actually non-trivial to automatically pick the number of quantiles from the
    # distribution, since some distributions are more skewed than others.
    # but why bin into quantiles at all, you might wonder...
    # we bin into quantiles because we map the target to a categorical variable
    # and treat the problem as classification, which gives us better accuracy as of now.
    # this is really MVP territory
    # ships > castles
    num_quantiles = min(7, df[ycol].nunique())
    quantiles = pd.qcut(df[ycol], q=num_quantiles, duplicates="drop")
    df[f"{ycol}_original"] = df[ycol]
    df[ycol] = quantiles.apply(lambda x: f"{x.left} to {x.right}")
    return df
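

# A minimal usage sketch of get_quantile_binned_target on a tiny synthetic frame.
# The column name "feature" and all values below are made up purely for illustration
# and are not part of the real pipeline; running this snippet only needs pandas.
if __name__ == "__main__":
    demo = pd.DataFrame(
        {
            "feature": ["aaa", "bbb", "ccc", "ddd", "eee", "fff"],
            "y": [-67.61, 94.89, 12.0, 15.78, -9.77, 25.39],
        }
    )
    binned = get_quantile_binned_target(demo, "y")
    # after binning, "y" holds interval strings of the form "<left edge> to <right edge>"
    # and "y_original" preserves the raw numeric target
    print(binned[["y", "y_original"]])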