Last active
June 25, 2021 21:22
-
-
Save atorch/ee2caf3b81156273e9df3947c8f8b854 to your computer and use it in GitHub Desktop.
Simple constant-width prediction intervals
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
See https://stats.stackexchange.com/questions/524863/simple-constant-width-prediction-interval-for-a-regression-model | |
""" | |
import numpy as np | |
import pandas as pd | |
from catboost import CatBoostRegressor | |
def simulate_dataframe(n_obs=1000): | |
x1 = np.random.uniform(size=n_obs) | |
# A triangle distribution | |
x2 = np.random.uniform(size=n_obs) + np.random.uniform(size=n_obs) | |
x3 = x1 + x2 + np.random.normal(size=n_obs) | |
# The regression error term (not Gaussian) | |
epsilon = ( | |
np.random.uniform(size=n_obs, low=-1, high=1) | |
+ np.random.uniform(size=n_obs, low=-1, high=1) | |
+ np.random.uniform(size=n_obs, low=-1, high=1) | |
) | |
# The true regression equation | |
y = 10 + x1 + 5 * x2 + 8 * x3 + x1 * x2 + epsilon | |
return pd.DataFrame({"y": y, "x1": x1, "x2": x2, "x3": x3}) | |
def _interval_coverage(df, desired_coverage):
    """Return the fraction of rows whose "y" falls strictly inside the
    precomputed interval_{desired_coverage}_lower / _upper columns of df."""
    inside = np.logical_and(
        df["y"] > df[f"interval_{desired_coverage}_lower"],
        df["y"] < df[f"interval_{desired_coverage}_upper"],
    )
    return np.mean(inside)


def main():
    """Fit a CatBoost regressor on simulated data, build constant-width
    prediction intervals from test-set error quantiles, and measure their
    coverage on a fresh "production" sample — overall and conditional on
    an extreme predictor value.
    """
    np.random.seed(9876543)
    train = simulate_dataframe(n_obs=2000)
    model = CatBoostRegressor(iterations=100, learning_rate=0.1, loss_function="RMSE")
    predictors = ["x1", "x2", "x3"]
    X_train = train[predictors]
    # Model fit to the training set
    model.fit(X=X_train, y=train["y"])
    test = simulate_dataframe(n_obs=2000)
    X_test = test[predictors]
    test["prediction"] = model.predict(X_test)
    test["error"] = test["y"] - test["prediction"]
    # Error quantiles learned from the test set (using both the test set and the model)
    error_quantiles = np.quantile(
        test["error"], q=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]
    )
    print(error_quantiles)
    production = simulate_dataframe(n_obs=10000)
    X_production = production[predictors]
    production["prediction"] = model.predict(X_production)
    # Now we need prediction intervals that work (i.e. have the desired coverage) in production
    for desired_coverage in [95, 90, 80, 50]:
        # Symmetric tail probabilities, e.g. 95% coverage -> q_lower=0.025, q_upper=0.975
        q_lower = (1 - (desired_coverage / 100.0)) / 2
        q_upper = (desired_coverage / 100.0) + q_lower
        production[f"interval_{desired_coverage}_upper"] = production[
            "prediction"
        ] + np.quantile(test["error"], q=q_upper)
        production[f"interval_{desired_coverage}_lower"] = production[
            "prediction"
        ] + np.quantile(test["error"], q=q_lower)
        fraction_in_interval = _interval_coverage(production, desired_coverage)
        print(
            f"Fraction of production Ys in {desired_coverage}% interval: {fraction_in_interval}"
        )
        # These prediction intervals have constant width. Will their coverage be higher or lower
        # depending on X? Intuitively, prediction intervals should be wider for Xs that are
        # at the edges of (or extrapolating outside of) the range seen during training, so
        # intervals with a constant width (wrt X) should have poor coverage at the edges
        x2_threshold = 1.8
        production_subset = production.loc[production["x2"] > x2_threshold]
        fraction_in_interval = _interval_coverage(production_subset, desired_coverage)
        print(
            f"Fraction of production Ys in {desired_coverage}% interval "
            f"(conditional on x2 > {x2_threshold}):"
            f" {fraction_in_interval}"
        )
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment