import logging
import shutil
import tempfile

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# TrainTestTargetDistribution is a project-specific container (defined elsewhere in
# the codebase) holding the train/test target lists and an example feature payload.

_LOGGER = logging.getLogger(__name__)


def train(df: pd.DataFrame, ycol: str, event_id: str) -> tuple[CatBoostClassifier, TrainTestTargetDistribution]:
    df = get_quantile_binned_target(df, ycol)
    # oversample classes that have only a single member by simple duplication,
    # so that a stratified train/test split strategy remains possible
    df = pd.concat([df, df[df.groupby(ycol)[ycol].transform("count") == 1]])
    X: pd.DataFrame = df.drop(columns=[ycol])
    y: pd.Series = df[ycol]
    # boosters like CatBoost and XGBoost need categorical variables to be encoded;
    # CatBoost can do this automatically, but we have to tell it which columns are categorical
    catcols = X.select_dtypes(exclude=[np.number]).columns.to_list()
    X[catcols] = X[catcols].astype("category")
    _LOGGER.info(f"T:PMODEL S:4/7 event_id={event_id} catcols={catcols}")
    # X_train, X_test, y_train, y_test = train_test_splitter(X, y, test_size=0.3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # keep the original (un-binned) target values for the train and test splits
    original_ycol = f"{ycol}_original"
    original_y_train = X_train[original_ycol]
    original_y_test = X_test[original_ycol]
    # drop the original target column from X_train and X_test so it is not used as a feature
    X_train = X_train.drop(columns=[original_ycol])
    X_test = X_test.drop(columns=[original_ycol])
    train_test_target_distribution = TrainTestTargetDistribution(
        train_targets=original_y_train.to_list(),
        test_targets=original_y_test.to_list(),
        example_payload=X_train.iloc[0].to_dict(),
    )
    train_dir = tempfile.mkdtemp()
    model = CatBoostClassifier(train_dir=train_dir, silent=True, cat_features=catcols)
    model.fit(X_train, y_train, eval_set=(X_test, y_test))
    shutil.rmtree(train_dir)
    return (model, train_test_target_distribution)

def get_quantile_binned_target(df: pd.DataFrame, ycol: str) -> pd.DataFrame:
"""
Returns a dataframe with the target column quantile binned.
Here's an example of what this function does:
input:
---
6502deb5b6b6605c4edbe841 6502deb5b6b6605c4edbe85c y
0 lzoqswqcur 88.55250 -67.61000
1 znbqnylced -28.91700 94.88640
2 itkpskpjim -90.60000 12.00000
3 yvbpjkwlxh -35.50090 15.78182
4 qvaqoirvfx 99.80000 -9.77100
5 zhigqexmsv 95.77939 25.39412
6 nltepbcbsu 69.46500 -81.42056
7 iapvffgfkx -1.08777 38.00000
8 shshwbovyl -69.51820 34.70000
9 jugddlnxcl -6.47582 36.08890
10 tlvpakccuz -49.00000 -84.00000
11 tdmarbqybn -22.21000 64.81100
12 kwfyxletbh -3.60846 31.72300
output:
---
6502deb5b6b6605c4edbe841 6502deb5b6b6605c4edbe85c y
0 lzoqswqcur 88.55250 -71.556 to -0.441
1 znbqnylced -28.91700 45.66 to 94.886
2 itkpskpjim -90.60000 -0.441 to 17.155
3 yvbpjkwlxh -35.50090 -0.441 to 17.155
4 qvaqoirvfx 99.80000 -71.556 to -0.441
5 zhigqexmsv 95.77939 17.155 to 30.819
6 nltepbcbsu 69.46500 -84.001 to -71.556
7 iapvffgfkx -1.08777 35.494 to 45.66
8 shshwbovyl -69.51820 30.819 to 35.494
9 jugddlnxcl -6.47582 35.494 to 45.66
10 tlvpakccuz -49.00000 -84.001 to -71.556
11 tdmarbqybn -22.21000 45.66 to 94.886
12 kwfyxletbh -3.60846 30.819 to 35.494
"""
    if df.empty:
        raise ValueError("Empty dataframe")
    # pick the magic quantile number, q=7
    # it's actually non-trivial to automatically pick the number of quantiles from the
    # distribution, since some distributions are more skewed than others.
    # but why bin into quantiles at all, you might wonder...
    # we bin into quantiles because we map the target to a categorical variable
    # and treat the problem as classification, which gives us better accuracy as of now.
    # this is really MVP territory
    # ships > castles
    num_quantiles = min(7, df[ycol].nunique())
    quantiles = pd.qcut(df[ycol], q=num_quantiles, duplicates="drop")
    df[f"{ycol}_original"] = df[ycol]
    df[ycol] = quantiles.apply(lambda x: f"{x.left} to {x.right}")
    return df
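

# A minimal usage sketch of get_quantile_binned_target on a tiny synthetic frame.
# The column name "feature" and all values below are made up purely for illustration
# and are not part of the real pipeline; running this snippet only needs pandas.
if __name__ == "__main__":
    demo = pd.DataFrame(
        {
            "feature": ["aaa", "bbb", "ccc", "ddd", "eee", "fff"],
            "y": [-67.61, 94.89, 12.0, 15.78, -9.77, 25.39],
        }
    )
    binned = get_quantile_binned_target(demo, "y")
    # after binning, "y" holds interval strings of the form "<left edge> to <right edge>"
    # and "y_original" preserves the raw numeric target
    print(binned[["y", "y_original"]])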