Simple constant-width prediction intervals
"""
See https://stats.stackexchange.com/questions/524863/simple-constant-width-prediction-interval-for-a-regression-model
"""
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
def simulate_dataframe(n_obs=1000):
    x1 = np.random.uniform(size=n_obs)

    # A triangle distribution
    x2 = np.random.uniform(size=n_obs) + np.random.uniform(size=n_obs)

    x3 = x1 + x2 + np.random.normal(size=n_obs)

    # The regression error term (not Gaussian)
    epsilon = (
        np.random.uniform(size=n_obs, low=-1, high=1)
        + np.random.uniform(size=n_obs, low=-1, high=1)
        + np.random.uniform(size=n_obs, low=-1, high=1)
    )

    # The true regression equation
    y = 10 + x1 + 5 * x2 + 8 * x3 + x1 * x2 + epsilon

    return pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3})


def main():
    np.random.seed(9876543)
    train = simulate_dataframe(n_obs=2000)
    model = CatBoostRegressor(iterations=100, learning_rate=0.1, loss_function="RMSE")
    predictors = ["x1", "x2", "x3"]
    X_train = train[predictors]

    # Model fit to the training set
    model.fit(X=X_train, y=train["y"])

    test = simulate_dataframe(n_obs=2000)
    X_test = test[predictors]
    test["prediction"] = model.predict(X_test)
    test["error"] = test["y"] - test["prediction"]

    # Error quantiles learned from the test set (using both the test set and the model)
    error_quantiles = np.quantile(
        test["error"], q=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]
    )
    print(error_quantiles)

    production = simulate_dataframe(n_obs=10000)
    X_production = production[predictors]
    production["prediction"] = model.predict(X_production)

    # Now we need prediction intervals that work (i.e. have the desired coverage) in production
    for desired_coverage in [95, 90, 80, 50]:
        q_lower = (1 - (desired_coverage / 100.0)) / 2
        q_upper = (desired_coverage / 100.0) + q_lower
        production[f"interval_{desired_coverage}_upper"] = production[
            "prediction"
        ] + np.quantile(test["error"], q=q_upper)
        production[f"interval_{desired_coverage}_lower"] = production[
            "prediction"
        ] + np.quantile(test["error"], q=q_lower)
        fraction_in_interval = np.mean(
            np.logical_and(
                production["y"] > production[f"interval_{desired_coverage}_lower"],
                production["y"] < production[f"interval_{desired_coverage}_upper"],
            )
        )
        print(
            f"Fraction of production Ys in {desired_coverage}% interval: {fraction_in_interval}"
        )

        # These prediction intervals have constant width. Will their coverage be higher or lower
        # depending on X? Intuitively, prediction intervals should be wider for Xs that are
        # at the edges of (or extrapolating outside of) the range seen during training, so
        # intervals with a constant width (wrt X) should have poor coverage at the edges
        x2_threshold = 1.8
        production_subset = production.loc[production["x2"] > x2_threshold]
        fraction_in_interval = np.mean(
            np.logical_and(
                production_subset["y"] > production_subset[f"interval_{desired_coverage}_lower"],
                production_subset["y"] < production_subset[f"interval_{desired_coverage}_upper"],
            )
        )
        print(
            f"Fraction of production Ys in {desired_coverage}% interval "
            f"(conditional on x2 > {x2_threshold}):"
            f" {fraction_in_interval}"
        )
if __name__ == "__main__":
main()
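
One way to address the caveat in the comments above (poor conditional coverage at the edges of X) is to let the interval width depend on X. Below is a minimal sketch of that idea, not part of the original gist, which fits separate lower- and upper-quantile models with CatBoost's Quantile loss; it reuses the simulate_dataframe and predictors names from the script above, and the 90% target is an arbitrary choice.

# Sketch only: X-dependent prediction intervals via quantile regression
from catboost import CatBoostRegressor


def fit_quantile_interval(X_train, y_train, desired_coverage=90):
    # One model per interval endpoint, so the interval width can vary with X
    q_lower = round((1 - desired_coverage / 100.0) / 2, 4)
    q_upper = 1 - q_lower
    lower_model = CatBoostRegressor(
        iterations=100,
        learning_rate=0.1,
        loss_function=f"Quantile:alpha={q_lower}",
        verbose=False,
    )
    upper_model = CatBoostRegressor(
        iterations=100,
        learning_rate=0.1,
        loss_function=f"Quantile:alpha={q_upper}",
        verbose=False,
    )
    lower_model.fit(X_train, y_train)
    upper_model.fit(X_train, y_train)
    return lower_model, upper_model

Coverage could then be checked exactly as in main(), comparing production["y"] against lower_model.predict(X_production) and upper_model.predict(X_production), including on the x2 > 1.8 subset.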