Last active
June 25, 2021 21:22
-
-
Save atorch/ee2caf3b81156273e9df3947c8f8b854 to your computer and use it in GitHub Desktop.
Simple constant-width prediction intervals
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
See https://stats.stackexchange.com/questions/524863/simple-constant-width-prediction-interval-for-a-regression-model | |
""" | |
import numpy as np | |
import pandas as pd | |
from catboost import CatBoostRegressor | |
def simulate_dataframe(n_obs=1000): | |
x1 = np.random.uniform(size=n_obs) | |
# A triangle distribution | |
x2 = np.random.uniform(size=n_obs) + np.random.uniform(size=n_obs) | |
x3 = x1 + x2 + np.random.normal(size=n_obs) | |
# The regression error term (not Gaussian) | |
epsilon = ( | |
np.random.uniform(size=n_obs, low=-1, high=1) | |
+ np.random.uniform(size=n_obs, low=-1, high=1) | |
+ np.random.uniform(size=n_obs, low=-1, high=1) | |
) | |
# The true regression equation | |
y = 10 + x1 + 5 * x2 + 8 * x3 + x1 * x2 + epsilon | |
return pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3}) | |
def _interval_coverage(df, desired_coverage):
    """Return the fraction of rows whose "y" falls strictly inside the
    precomputed interval_{desired_coverage}_lower / _upper columns of df."""
    inside = np.logical_and(
        df["y"] > df[f"interval_{desired_coverage}_lower"],
        df["y"] < df[f"interval_{desired_coverage}_upper"],
    )
    return np.mean(inside)


def main():
    """Fit a CatBoost regressor on simulated data, build constant-width
    prediction intervals from test-set error quantiles, and measure their
    coverage on a fresh "production" sample — overall and conditional on
    an extreme predictor value.
    """
    np.random.seed(9876543)
    train = simulate_dataframe(n_obs=2000)
    model = CatBoostRegressor(iterations=100, learning_rate=0.1, loss_function="RMSE")
    predictors = ["x1", "x2", "x3"]
    X_train = train[predictors]
    # Model fit to the training set
    model.fit(X=X_train, y=train["y"])
    test = simulate_dataframe(n_obs=2000)
    X_test = test[predictors]
    test["prediction"] = model.predict(X_test)
    test["error"] = test["y"] - test["prediction"]
    # Error quantiles learned from the test set (using both the test set and the model)
    error_quantiles = np.quantile(
        test["error"], q=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]
    )
    print(error_quantiles)
    production = simulate_dataframe(n_obs=10000)
    X_production = production[predictors]
    production["prediction"] = model.predict(X_production)
    # Now we need prediction intervals that work (i.e. have the desired coverage) in production
    for desired_coverage in [95, 90, 80, 50]:
        # Symmetric tail probabilities, e.g. 95% coverage -> q_lower=0.025, q_upper=0.975
        q_lower = (1 - (desired_coverage / 100.0)) / 2
        q_upper = (desired_coverage / 100.0) + q_lower
        production[f"interval_{desired_coverage}_upper"] = production[
            "prediction"
        ] + np.quantile(test["error"], q=q_upper)
        production[f"interval_{desired_coverage}_lower"] = production[
            "prediction"
        ] + np.quantile(test["error"], q=q_lower)
        fraction_in_interval = _interval_coverage(production, desired_coverage)
        print(
            f"Fraction of production Ys in {desired_coverage}% interval: {fraction_in_interval}"
        )
        # These prediction intervals have constant width. Will their coverage be higher or lower
        # depending on X? Intuitively, prediction intervals should be wider for Xs that are
        # at the edges of (or extrapolating outside of) the range seen during training, so
        # intervals with a constant width (wrt X) should have poor coverage at the edges
        x2_threshold = 1.8
        production_subset = production.loc[production["x2"] > x2_threshold]
        fraction_in_interval = _interval_coverage(production_subset, desired_coverage)
        print(
            f"Fraction of production Ys in {desired_coverage}% interval "
            f"(conditional on x2 > {x2_threshold}):"
            f" {fraction_in_interval}"
        )
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment