MartinNowak/Scikit-HTS.py

## errors.md

      
    Raw
  

              errors.md
            
          
total
states
stores
cats


OLS
agg_diff
0.926737
0.922922
0.925042
0.958276


r2
-6.790550
-0.134777
0.861383
0.961364


WLSS
agg_diff
0.926805
0.922828
0.925050
0.957586


r2
-6.780770
-0.137587
0.860504
0.961340


our-middle-out
agg_diff
0.927709
0.923106
0.925246
0.958024


r2
-6.680218
-0.137359
0.861531
0.961585


no-reconciliation
agg_diff
0.926443
0.923957
0.925246
0.956866


r2
-6.824754
-0.120379
0.861531
0.960979


## Scikit-HTS.py
#!/usr/bin/env python
# coding: utf-8

# # scikit-hts showcase: [M5 kaggle competition](https://www.kaggle.com/c/m5-forecasting-accuracy)
#
# In this notebook we will use the data from the Kaggle competiton to perform some hierarchical forecasting. The problem is particularly well suited for the library, as there is a clear hierarchical relationship between each of the series: we have states, stores, categories, departments, and items; sum of sales of items resolve to departments, which summed resolved to categories and so forth.
#
#
# We will however limit the scope of the forecasting task to producing forecasts at the
# department level, rather than going to the full extent and forecasting for single items.
#
# The reasons for this is that this notebook is designed for exemplification purposes, rather than providing a workable solution for the challenge.

# See [Local runtimes](https://research.google.com/colaboratory/local-runtimes.html)

import os

# os.system('pip install matplotlib pandas scikit-hts kaggle')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hts

from sklearn.pipeline import Pipeline
from prophet import Prophet

# Download the raw files into this directory
data = "data"


# ## Transform data in the format required by scikit-hts

# We will build a hierarchy tree that will look like the following:
#
#
#     Level                             Node Key                          # of nodes
#
#       1                                total                                  1
#
#       2                 CA               TX                   WI              3
#
#       3           CA_1 CA_2 CA_3   TX_1 TX_2 TX_3        WI_1 WI_2 WI_3       6
#
#       4      CA_1_HOBBIES_1 ....   TX_1_HOBBIES_1...    ... WI_3_FOODS_3     70
#       ...
#
#
#
# The steps are the following:
#
# 1. Transform dataset into a column oriented one
# 2. Create the hierarchy representation as a dictionary
#
# For a complete description of how that is done under the hood, and for a sense of what the API accepts, see [scikit-hts' docs](https://scikit-hts.readthedocs.io/en/latest/hierarchy.html)


def get_data():
    os.system(" mkdir -p data")
    os.system(
        " cd data && kaggle competitions download -c m5-forecasting-accuracy && unzip -o m5-forecasting-accuracy.zip"
    )


calendar = pd.read_csv(os.path.join(data, "calendar.csv"), index_col="d")


def preprocess(path: str) -> tuple[pd.DataFrame, np.ndarray, list[str]]:
    df = pd.read_csv(path).drop(columns=["item_id", "id"])
    id_cols = [col for col in df.columns if not col.startswith("d_")]
    df.columns = [
        pd.to_datetime(calendar.loc[col].date) if col not in id_cols else col
        for col in df.columns
    ]
    df = df.melt(id_vars=id_cols, var_name="date", value_name="sales")

    # aggregate item_id to reduce number of models
    hierarchy = ["state_id", "store_id", "cat_id", "dept_id"]
    df = df.groupby(hierarchy + ["date"], as_index=False).sum()

    # get rid of underscores to avoid colliding with hts's internal underscore concatenation
    for col in hierarchy:
        df[col] = df[col].str.replace("_", "")

    return hts.functions.get_hierarchichal_df(
        df,
        level_names=hierarchy,
        hierarchy=[hierarchy[0 : i + 1] for i in range(len(hierarchy) - 1)],
        date_colname="date",
        val_colname="sales",
    )


def read_data(path: str) -> tuple[pd.DataFrame, np.ndarray, list[str]]:
    if not os.path.exists(path):
        get_data()
    return preprocess(path)


df, sum_mat, sum_mat_labels = read_data(
    os.path.join(data, "sales_train_evaluation.csv")
)
# reorder columns hierarchically
df = df[sum_mat_labels]
train_df, eval_df = df[:-28], df[-28:]
train_df.head()

forecasts = pd.DataFrame(index=eval_df.index, columns=eval_df.columns)
for col in sum_mat_labels:
    m = Prophet()
    m.fit(train_df[col].reset_index().rename(columns={"date": "ds", col: "y"}))
    forecasts[col] = m.predict(
        forecasts[col].reset_index().rename(columns={"date": "ds"})
    ).yhat.values

# reconciliation
errors = pd.DataFrame()
for method in ["OLS", "WLSS"]:
    revised_forecasts = pd.DataFrame(
        data=hts.functions.optimal_combination(
            {
                col: pd.DataFrame(data=forecasts[col].values, columns=["yhat"])
                for col in sum_mat_labels
            },
            sum_mat,
            method=method,
            mse={},
        ),
        index=forecasts.index,
        columns=sum_mat_labels,
    )

    agg_diff = revised_forecasts.sum() / eval_df.sum()
    r2 = 1 - ((revised_forecasts - eval_df) ** 2).sum() / eval_df.var().sum()
    errors = pd.concat(
        (
            errors,
            pd.concat(
                (agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]
            ),
        ),
        axis=1,
    )


# compared to our middle-out approach
revised_forecasts = forecasts.copy()
# bottom-up part
states = revised_forecasts.filter(regex="^\w\w$").columns
revised_forecasts["total"] = revised_forecasts[states].sum(axis=1)
for state in states:
    revised_forecasts[state] = revised_forecasts.filter(
        regex=f"^{state}_{state}\d$"
    ).sum(axis=1)

# top-down part
stores = revised_forecasts.filter(regex="^\w\w_\w\w\d$").columns
for store in stores:
    cats = revised_forecasts.filter(regex=f"^{store}_[^_]*$").columns
    pred_sums = revised_forecasts[cats].sum()
    pred_shares = pred_sums / pred_sums.sum()
    for cat in cats:
        revised_forecasts[cat] = revised_forecasts[store] * pred_shares[cat]

cats = revised_forecasts.filter(regex="^\w\w_\w\w\d_[^_]*$").columns
for cat in cats:
    depts = revised_forecasts.filter(regex=f"^{cat}_").columns
    pred_sums = revised_forecasts[depts].sum()
    pred_shares = pred_sums / pred_sums.sum()
    for dept in depts:
        revised_forecasts[dept] = revised_forecasts[cat] * pred_shares[dept]

method = "our-middle-out"
agg_diff = revised_forecasts.sum() / eval_df.sum()
r2 = 1 - ((revised_forecasts - eval_df) ** 2).sum() / eval_df.var().sum()
errors = pd.concat(
    (
        errors,
        pd.concat((agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]),
    ),
    axis=1,
)

method = "no-reconciliation"
agg_diff = forecasts.sum() / eval_df.sum()
r2 = 1 - ((forecasts - eval_df) ** 2).sum() / eval_df.var().sum()
errors = pd.concat(
    (
        errors,
        pd.concat((agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]),
    ),
    axis=1,
)

print(
    pd.concat(
        [
            errors.loc["total"].rename("total"),
            errors.loc[states].mean().rename("states"),
            errors.loc[stores].mean().rename("stores"),
            errors.loc[cats].mean().rename("cats"),
        ],
        axis=1,
    )
)
		total	states	stores	cats
OLS	agg_diff	0.926737	0.922922	0.925042	0.958276
	r2	-6.790550	-0.134777	0.861383	0.961364
WLSS	agg_diff	0.926805	0.922828	0.925050	0.957586
	r2	-6.780770	-0.137587	0.860504	0.961340
our-middle-out	agg_diff	0.927709	0.923106	0.925246	0.958024
	r2	-6.680218	-0.137359	0.861531	0.961585
no-reconciliation	agg_diff	0.926443	0.923957	0.925246	0.956866
	r2	-6.824754	-0.120379	0.861531	0.960979
	#!/usr/bin/env python
	# coding: utf-8

	# # scikit-hts showcase: [M5 kaggle competition](https://www.kaggle.com/c/m5-forecasting-accuracy)
	#
	# In this notebook we will use the data from the Kaggle competiton to perform some hierarchical forecasting. The problem is particularly well suited for the library, as there is a clear hierarchical relationship between each of the series: we have states, stores, categories, departments, and items; sum of sales of items resolve to departments, which summed resolved to categories and so forth.
	#
	#
	# We will however limit the scope of the forecasting task to producing forecasts at the
	# department level, rather than going to the full extent and forecasting for single items.
	#
	# The reasons for this is that this notebook is designed for exemplification purposes, rather than providing a workable solution for the challenge.

	# See [Local runtimes](https://research.google.com/colaboratory/local-runtimes.html)

	import os

	# os.system('pip install matplotlib pandas scikit-hts kaggle')

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import hts

	from sklearn.pipeline import Pipeline
	from prophet import Prophet

	# Download the raw files into this directory
	data = "data"


	# ## Transform data in the format required by scikit-hts

	# We will build a hierarchy tree that will look like the following:
	#
	#
	# Level Node Key # of nodes
	#
	# 1 total 1
	#
	# 2 CA TX WI 3
	#
	# 3 CA_1 CA_2 CA_3 TX_1 TX_2 TX_3 WI_1 WI_2 WI_3 6
	#
	# 4 CA_1_HOBBIES_1 .... TX_1_HOBBIES_1... ... WI_3_FOODS_3 70
	# ...
	#
	#
	#
	# The steps are the following:
	#
	# 1. Transform dataset into a column oriented one
	# 2. Create the hierarchy representation as a dictionary
	#
	# For a complete description of how that is done under the hood, and for a sense of what the API accepts, see [scikit-hts' docs](https://scikit-hts.readthedocs.io/en/latest/hierarchy.html)


	def get_data():
	os.system(" mkdir -p data")
	os.system(
	" cd data && kaggle competitions download -c m5-forecasting-accuracy && unzip -o m5-forecasting-accuracy.zip"
	)


	calendar = pd.read_csv(os.path.join(data, "calendar.csv"), index_col="d")


	def preprocess(path: str) -> tuple[pd.DataFrame, np.ndarray, list[str]]:
	df = pd.read_csv(path).drop(columns=["item_id", "id"])
	id_cols = [col for col in df.columns if not col.startswith("d_")]
	df.columns = [
	pd.to_datetime(calendar.loc[col].date) if col not in id_cols else col
	for col in df.columns
	]
	df = df.melt(id_vars=id_cols, var_name="date", value_name="sales")

	# aggregate item_id to reduce number of models
	hierarchy = ["state_id", "store_id", "cat_id", "dept_id"]
	df = df.groupby(hierarchy + ["date"], as_index=False).sum()

	# get rid of underscores to avoid colliding with hts's internal underscore concatenation
	for col in hierarchy:
	df[col] = df[col].str.replace("_", "")

	return hts.functions.get_hierarchichal_df(
	df,
	level_names=hierarchy,
	hierarchy=[hierarchy[0 : i + 1] for i in range(len(hierarchy) - 1)],
	date_colname="date",
	val_colname="sales",
	)


	def read_data(path: str) -> tuple[pd.DataFrame, np.ndarray, list[str]]:
	if not os.path.exists(path):
	get_data()
	return preprocess(path)


	df, sum_mat, sum_mat_labels = read_data(
	os.path.join(data, "sales_train_evaluation.csv")
	)
	# reorder columns hierarchically
	df = df[sum_mat_labels]
	train_df, eval_df = df[:-28], df[-28:]
	train_df.head()

	forecasts = pd.DataFrame(index=eval_df.index, columns=eval_df.columns)
	for col in sum_mat_labels:
	m = Prophet()
	m.fit(train_df[col].reset_index().rename(columns={"date": "ds", col: "y"}))
	forecasts[col] = m.predict(
	forecasts[col].reset_index().rename(columns={"date": "ds"})
	).yhat.values

	# reconciliation
	errors = pd.DataFrame()
	for method in ["OLS", "WLSS"]:
	revised_forecasts = pd.DataFrame(
	data=hts.functions.optimal_combination(
	{
	col: pd.DataFrame(data=forecasts[col].values, columns=["yhat"])
	for col in sum_mat_labels
	},
	sum_mat,
	method=method,
	mse={},
	),
	index=forecasts.index,
	columns=sum_mat_labels,
	)

	agg_diff = revised_forecasts.sum() / eval_df.sum()
	r2 = 1 - ((revised_forecasts - eval_df) ** 2).sum() / eval_df.var().sum()
	errors = pd.concat(
	(
	errors,
	pd.concat(
	(agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]
	),
	),
	axis=1,
	)


	# compared to our middle-out approach
	revised_forecasts = forecasts.copy()
	# bottom-up part
	states = revised_forecasts.filter(regex="^\w\w$").columns
	revised_forecasts["total"] = revised_forecasts[states].sum(axis=1)
	for state in states:
	revised_forecasts[state] = revised_forecasts.filter(
	regex=f"^{state}_{state}\d$"
	).sum(axis=1)

	# top-down part
	stores = revised_forecasts.filter(regex="^\w\w_\w\w\d$").columns
	for store in stores:
	cats = revised_forecasts.filter(regex=f"^{store}_[^_]*$").columns
	pred_sums = revised_forecasts[cats].sum()
	pred_shares = pred_sums / pred_sums.sum()
	for cat in cats:
	revised_forecasts[cat] = revised_forecasts[store] * pred_shares[cat]

	cats = revised_forecasts.filter(regex="^\w\w_\w\w\d_[^_]*$").columns
	for cat in cats:
	depts = revised_forecasts.filter(regex=f"^{cat}_").columns
	pred_sums = revised_forecasts[depts].sum()
	pred_shares = pred_sums / pred_sums.sum()
	for dept in depts:
	revised_forecasts[dept] = revised_forecasts[cat] * pred_shares[dept]

	method = "our-middle-out"
	agg_diff = revised_forecasts.sum() / eval_df.sum()
	r2 = 1 - ((revised_forecasts - eval_df) ** 2).sum() / eval_df.var().sum()
	errors = pd.concat(
	(
	errors,
	pd.concat((agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]),
	),
	axis=1,
	)

	method = "no-reconciliation"
	agg_diff = forecasts.sum() / eval_df.sum()
	r2 = 1 - ((forecasts - eval_df) ** 2).sum() / eval_df.var().sum()
	errors = pd.concat(
	(
	errors,
	pd.concat((agg_diff, r2), axis=1, keys=[(method, "agg_diff"), (method, "r2")]),
	),
	axis=1,
	)

	print(
	pd.concat(
	[
	errors.loc["total"].rename("total"),
	errors.loc[states].mean().rename("states"),
	errors.loc[stores].mean().rename("stores"),
	errors.loc[cats].mean().rename("cats"),
	],
	axis=1,
	)
	)