Skip to content

Instantly share code, notes, and snippets.

@MartinNowak
Created March 11, 2022 15:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MartinNowak/20f7bd4db1a356f296d0f479b0f07ad6 to your computer and use it in GitHub Desktop.
Save MartinNowak/20f7bd4db1a356f296d0f479b0f07ad6 to your computer and use it in GitHub Desktop.
hts for m5 dataset
total states stores cats
OLS agg_diff 0.926737 0.922922 0.925042 0.958276
r2 -6.790550 -0.134777 0.861383 0.961364
WLSS agg_diff 0.926805 0.922828 0.925050 0.957586
r2 -6.780770 -0.137587 0.860504 0.961340
our-middle-out agg_diff 0.927709 0.923106 0.925246 0.958024
r2 -6.680218 -0.137359 0.861531 0.961585
no-reconciliation agg_diff 0.926443 0.923957 0.925246 0.956866
r2 -6.824754 -0.120379 0.861531 0.960979
#!/usr/bin/env python
# coding: utf-8
# # scikit-hts showcase: [M5 kaggle competition](https://www.kaggle.com/c/m5-forecasting-accuracy)
#
# In this notebook we will use the data from the Kaggle competiton to perform some hierarchical forecasting. The problem is particularly well suited for the library, as there is a clear hierarchical relationship between each of the series: we have states, stores, categories, departments, and items; sum of sales of items resolve to departments, which summed resolved to categories and so forth.
#
#
# We will however limit the scope of the forecasting task to producing forecasts at the
# department level, rather than going to the full extent and forecasting for single items.
#
# The reasons for this is that this notebook is designed for exemplification purposes, rather than providing a workable solution for the challenge.
# See [Local runtimes](https://research.google.com/colaboratory/local-runtimes.html)
import os
# os.system('pip install matplotlib pandas scikit-hts kaggle')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hts
from sklearn.pipeline import Pipeline
from prophet import Prophet
# Download the raw files into this directory
data = "data"
# ## Transform data in the format required by scikit-hts
# We will build a hierarchy tree that will look like the following:
#
#
# Level Node Key # of nodes
#
# 1 total 1
#
# 2 CA TX WI 3
#
# 3 CA_1 CA_2 CA_3 TX_1 TX_2 TX_3 WI_1 WI_2 WI_3 6
#
# 4 CA_1_HOBBIES_1 .... TX_1_HOBBIES_1... ... WI_3_FOODS_3 70
# ...
#
#
#
# The steps are the following:
#
# 1. Transform dataset into a column oriented one
# 2. Create the hierarchy representation as a dictionary
#
# For a complete description of how that is done under the hood, and for a sense of what the API accepts, see [scikit-hts' docs](https://scikit-hts.readthedocs.io/en/latest/hierarchy.html)
def get_data():
os.system(" mkdir -p data")
os.system(
" cd data && kaggle competitions download -c m5-forecasting-accuracy && unzip -o m5-forecasting-accuracy.zip"
)
calendar = pd.read_csv(os.path.join(data, "calendar.csv"), index_col="d")
def preprocess(path: str) -> tuple[pd.DataFrame, np.ndarray, list[str]]:
df = pd.read_csv(path).drop(columns=["item_id", "id"])
id_cols = [col for col in df.columns if not col.startswith("d_")]
df.columns = [
pd.to_datetime(calendar.loc[col].date) if col not in id_cols else col
for col in df.columns
]
df = df.melt(id_vars=id_cols, var_name="date", value_name="sales")
# aggregate item_id to reduce number of models
hierarchy = ["state_id", "store_id", "cat_id", "dept_id"]
df = df.groupby(hierarchy + ["date"], as_index=False).sum()
# get rid of underscores to avoid colliding with hts's internal underscore concatenation
for col in hierarchy:
df[col] = df[col].str.replace("_", "")
return hts.functions.get_hierarchichal_df(
df,
level_names=hierarchy,
hierarchy=[hierarchy[0 : i + 1] for i in range(len(hierarchy) - 1)],
date_colname="date",
val_colname="sales",
)
def read_data(path: str) -> tuple[pd.DataFrame, np.ndarray, list[str]]:
if not os.path.exists(path):
get_data()
return preprocess(path)
df, sum_mat, sum_mat_labels = read_data(
os.path.join(data, "sales_train_evaluation.csv")
)
# reorder columns hierarchically
df = df[sum_mat_labels]
train_df, eval_df = df[:-28], df[-28:]
train_df.head()
forecasts = pd.DataFrame(index=eval_df.index, columns=eval_df.columns)
for col in sum_mat_labels:
m = Prophet()
m.fit(train_df[col].reset_index().rename(columns={"date": "ds", col: "y"}))
forecasts[col] = m.predict(
forecasts[col].reset_index().rename(columns={"date": "ds"})
).yhat.values
# reconciliation
errors = pd.DataFrame()
for method in ["OLS", "WLSS"]:
revised_forecasts = pd.DataFrame(
data=hts.functions.optimal_combination(
{
col: pd.DataFrame(data=forecasts[col].values, columns=["yhat"])
for col in sum_mat_labels
},
sum_mat,
method=method,
mse={},
),
index=forecasts.index,
columns=sum_mat_labels,
)
agg_diff = revised_forecasts.sum() / eval_df.sum()
r2 = 1 - ((revised_forecasts - eval_df) ** 2).sum() / eval_df.var().sum()
errors = pd.concat(
(
errors,
pd.concat(
(agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]
),
),
axis=1,
)
# compared to our middle-out approach
revised_forecasts = forecasts.copy()
# bottom-up part
states = revised_forecasts.filter(regex="^\w\w$").columns
revised_forecasts["total"] = revised_forecasts[states].sum(axis=1)
for state in states:
revised_forecasts[state] = revised_forecasts.filter(
regex=f"^{state}_{state}\d$"
).sum(axis=1)
# top-down part
stores = revised_forecasts.filter(regex="^\w\w_\w\w\d$").columns
for store in stores:
cats = revised_forecasts.filter(regex=f"^{store}_[^_]*$").columns
pred_sums = revised_forecasts[cats].sum()
pred_shares = pred_sums / pred_sums.sum()
for cat in cats:
revised_forecasts[cat] = revised_forecasts[store] * pred_shares[cat]
cats = revised_forecasts.filter(regex="^\w\w_\w\w\d_[^_]*$").columns
for cat in cats:
depts = revised_forecasts.filter(regex=f"^{cat}_").columns
pred_sums = revised_forecasts[depts].sum()
pred_shares = pred_sums / pred_sums.sum()
for dept in depts:
revised_forecasts[dept] = revised_forecasts[cat] * pred_shares[dept]
method = "our-middle-out"
agg_diff = revised_forecasts.sum() / eval_df.sum()
r2 = 1 - ((revised_forecasts - eval_df) ** 2).sum() / eval_df.var().sum()
errors = pd.concat(
(
errors,
pd.concat((agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]),
),
axis=1,
)
method = "no-reconciliation"
agg_diff = forecasts.sum() / eval_df.sum()
r2 = 1 - ((forecasts - eval_df) ** 2).sum() / eval_df.var().sum()
errors = pd.concat(
(
errors,
pd.concat((agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]),
),
axis=1,
)
print(
pd.concat(
[
errors.loc["total"].rename("total"),
errors.loc[states].mean().rename("states"),
errors.loc[stores].mean().rename("stores"),
errors.loc[cats].mean().rename("cats"),
],
axis=1,
)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment