glemaitre/plot_forecasting.py

## plot_forecasting.py
# %%
# Download the original dataset to be able to easily build an index with the
# original datetime.
# The dataset is available at:
# https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
import pandas as pd

df_external = pd.read_csv(
    "~/Downloads/Bike-Sharing-Dataset/hour.csv",
    index_col=0,
)
df_external.head()

# %%
dteday = pd.to_datetime(df_external["dteday"])

# %%
time_index = pd.to_datetime(
    {
        "year": dteday.dt.year,
        "month": dteday.dt.month,
        "day": dteday.dt.day,
        "hour": df_external["hr"],
    }
)

# %%
# We create the same dataset than in the original example:
# https://scikit-learn.org/dev/auto_examples/applications/plot_cyclical_feature_engineering.html#sphx-glr-auto-examples-applications-plot-cyclical-feature-engineering-py
from sklearn.datasets import fetch_openml

bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
df = bike_sharing.frame

# %%
# To get a quick understanding of the periodic patterns of the data, let us
# have a look at the average demand per hour during a week.
#
# Note that the week starts on a Sunday, during the weekend. We can clearly
# distinguish the commute patterns in the morning and evenings of the work days
# and the leisure use of the bikes on the weekends with a more spread peak
# demand around the middle of the days:
import matplotlib.pyplot as plt


fig, ax = plt.subplots(figsize=(12, 4))
average_week_demand = df.groupby(["weekday", "hour"]).mean()["count"]
average_week_demand.plot(ax=ax)
_ = ax.set(
    title="Average hourly bike demand during the week",
    xticks=[i * 24 for i in range(7)],
    xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"],
    xlabel="Time of the week",
    ylabel="Number of bike rentals",
)

# %%
#
# The target of the prediction problem is the absolute count of bike rentals on
# a hourly basis:
df["count"].max()

# %%
#
# Let us rescale the target variable (number of hourly bike rentals) to predict
# a relative demand so that the mean absolute error is more easily interpreted
# as a fraction of the maximum demand.
#
# .. note::
#
#     The fit method of the models used in this notebook all minimize the
#     mean squared error to estimate the conditional mean instead of the mean
#     absolute error that would fit an estimator of the conditional median.
#
#     When reporting performance measure on the test set in the discussion, we
#     instead choose to focus on the mean absolute error that is more
#     intuitive than the (root) mean squared error. Note, however, that the
#     best models for one metric are also the best for the other in this
#     study.
y = df["count"] / 1000

# %%
fig, ax = plt.subplots(figsize=(12, 4))
y.hist(bins=30, ax=ax)
_ = ax.set(
    xlabel="Fraction of rented fleet demand",
    ylabel="Number of hours",
)

# %%
# The input feature data frame is a time annotated hourly log of variables
# describing the weather conditions. It includes both numerical and categorical
# variables. Note that the time information has already been expanded into
# several complementary columns.
#
X = df.drop("count", axis="columns")
X

# %%
# .. note::
#
#    If the time information was only present as a date or datetime column, we
#    could have expanded it into hour-in-the-day, day-in-the-week,
#    day-in-the-month, month-in-the-year using pandas:
#    https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-date-components
#
# We now introspect the distribution of the categorical variables, starting
# with `"weather"`:
#
X["weather"].value_counts()

# %%
# Since there are only 3 `"heavy_rain"` events, we cannot use this category to
# train machine learning models with cross validation. Instead, we simplify the
# representation by collapsing those into the `"rain"` category.
#
X["weather"].replace(to_replace="heavy_rain", value="rain", inplace=True)
# %%
X["weather"].value_counts()

# %%
X.tail()

# %%
X.index = pd.Index(time_index).to_period("H")

# %%
X.head()

# %%
y.index = pd.Index(time_index).to_period("H")

# %%
# From now on, we will try to use sktime to to process the data
# as a forecasting problem. Indeed, we want a model that will
# augment the data by addind information from past samples.
#
# First, let's split the data into 2 sets: 2010 that will be used
# for training and 2011 that will be used for testing.
from sktime.forecasting.model_selection import temporal_train_test_split

y_train, y_test, X_train, X_test = temporal_train_test_split(
    y, X, test_size=0.5
)

# %%
# We will start by creating a dummy forecaster that will predict the
# latest values seen in the time series.
from sktime.forecasting.naive import NaiveForecaster

forecaster = NaiveForecaster()
forecaster.fit(y_train)

# %%
# The way of doing prediction is a not straightforward. One needs to
# create a ForecastHorizon object that take the time index for which
# one wants the prediction. This will enforce your time index to be
# a regularly spaced period.
from sktime.forecasting.base import ForecastingHorizon

fh = ForecastingHorizon(
    y_test.index, is_relative=False
)

forecaster.predict(fh)

# %%
# Now, let's create a preprocessor to handle the heterogeneous data
# in X. We also create an HGBRT as a predictor.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor

categorical_columns = [
    "weather",
    "season",
    "holiday",
    "workingday",
]
categories = [
    ["clear", "misty", "rain"],
    ["spring", "summer", "fall", "winter"],
    ["False", "True"],
    ["False", "True"],
]
ordinal_encoder = OrdinalEncoder(categories=categories)
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", ordinal_encoder, categorical_columns),
    ],
    remainder="passthrough",
)
gbrt = HistGradientBoostingRegressor()

# %%
# A simple scikit-learn approach is to pipeline the preprocessor
# and the HGBDT. However, we treat a sample by providing data only
# at time t, without any hint from previous samples.
gbrt_pipeline = make_pipeline(preprocessor, gbrt)
gbrt_pipeline.fit(X_train, y_train)
y_pred_regressor = gbrt_pipeline.predict(X_test)
y_pred_regressor = pd.Series(y_pred_regressor, index=y_test.index)

# %%
# Now, we would like to have the same type of model that would take
# into accound past samples. It seems that sktime is not good at
# taking into account heterogeneous data and therefore we have to
# manage everything by hand. One might wants to check the
# ForecasterPipeline in case that it could be used instead.
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)

# %%
# Recreate some dataset with the right indices.
X_train_trans = pd.DataFrame(
    X_train_trans, index=X_train.index, columns=X_train.columns,
)
X_test_trans = pd.DataFrame(
    X_test_trans, index=X_test.index, columns=X_test.columns,
)

# %%
# We can create a forecaster from the scikit-learn pipeline. We
# need to use make_reduction and specify the window_length to create
# internally a new dataframe that will take the past samples into
# account.
#
# The recursive strategy means that we will fit a single model to predict
# every samples in the horizon. We could have more complex strategy to
# use a particular model for each horizon sample.
from sktime.forecasting.compose import make_reduction

forecaster = make_reduction(
    gbrt, window_length=15, strategy="recursive", scitype="tabular-regressor"
)


# %%
# Plotting the data, we could see that there is a general trend that
# bike rental is generally increasing. We will take into account this
# general trend by preprocessing the target.
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.trend import PolynomialTrendForecaster

detrender = Detrender(PolynomialTrendForecaster(degree=1))

# %%
# The TransformedTargetForecaster has a weird Pipeline API.
from sktime.forecasting.compose import TransformedTargetForecaster

forecaster_detrended = TransformedTargetForecaster(
    steps=[
        ("detrender", detrender),
        ("forecaster", forecaster),
    ]
)

# %%
# The forecaster will not support ot have data with missing entry.
# This is probably wrong but we will resample to have a regularly
# spaced samples and use a forward fill.
forecaster_detrended.fit(
    y=y_train.resample("H").ffill(),
    X=X_train_trans.resample("H").ffill())

# %%
# We do the same to get the predictions.
y_pred_forecasting = forecaster_detrended.predict(
    fh, X=X_test_trans.resample("H").ffill()
)

# %%
# Finally, let's make a plot by resampling by day in order to see
# something on the plot.
import matplotlib.pyplot as plt

_, ax = plt.subplots()
y_test.resample("D").mean().plot(ax=ax, label="True targets")
y_pred_forecasting.resample("D").mean().plot(ax=ax, label="Forecaster")
y_pred_regressor.resample("D").mean().plot(ax=ax, label="Regressor")
ax.set_ylabel("Target")
ax.legend()
ax.set_title("Comparison of a regressor and a forecaster")

# %%
	# %%
	# Download the original dataset to be able to easily build an index with the
	# original datetime.
	# The dataset is available at:
	# https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
	import pandas as pd

	df_external = pd.read_csv(
	"~/Downloads/Bike-Sharing-Dataset/hour.csv",
	index_col=0,
	)
	df_external.head()

	# %%
	dteday = pd.to_datetime(df_external["dteday"])

	# %%
	time_index = pd.to_datetime(
	{
	"year": dteday.dt.year,
	"month": dteday.dt.month,
	"day": dteday.dt.day,
	"hour": df_external["hr"],
	}
	)

	# %%
	# We create the same dataset than in the original example:
	# https://scikit-learn.org/dev/auto_examples/applications/plot_cyclical_feature_engineering.html#sphx-glr-auto-examples-applications-plot-cyclical-feature-engineering-py
	from sklearn.datasets import fetch_openml

	bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
	df = bike_sharing.frame

	# %%
	# To get a quick understanding of the periodic patterns of the data, let us
	# have a look at the average demand per hour during a week.
	#
	# Note that the week starts on a Sunday, during the weekend. We can clearly
	# distinguish the commute patterns in the morning and evenings of the work days
	# and the leisure use of the bikes on the weekends with a more spread peak
	# demand around the middle of the days:
	import matplotlib.pyplot as plt


	fig, ax = plt.subplots(figsize=(12, 4))
	average_week_demand = df.groupby(["weekday", "hour"]).mean()["count"]
	average_week_demand.plot(ax=ax)
	_ = ax.set(
	title="Average hourly bike demand during the week",
	xticks=[i * 24 for i in range(7)],
	xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"],
	xlabel="Time of the week",
	ylabel="Number of bike rentals",
	)

	# %%
	#
	# The target of the prediction problem is the absolute count of bike rentals on
	# a hourly basis:
	df["count"].max()

	# %%
	#
	# Let us rescale the target variable (number of hourly bike rentals) to predict
	# a relative demand so that the mean absolute error is more easily interpreted
	# as a fraction of the maximum demand.
	#
	# .. note::
	#
	# The fit method of the models used in this notebook all minimize the
	# mean squared error to estimate the conditional mean instead of the mean
	# absolute error that would fit an estimator of the conditional median.
	#
	# When reporting performance measure on the test set in the discussion, we
	# instead choose to focus on the mean absolute error that is more
	# intuitive than the (root) mean squared error. Note, however, that the
	# best models for one metric are also the best for the other in this
	# study.
	y = df["count"] / 1000

	# %%
	fig, ax = plt.subplots(figsize=(12, 4))
	y.hist(bins=30, ax=ax)
	_ = ax.set(
	xlabel="Fraction of rented fleet demand",
	ylabel="Number of hours",
	)

	# %%
	# The input feature data frame is a time annotated hourly log of variables
	# describing the weather conditions. It includes both numerical and categorical
	# variables. Note that the time information has already been expanded into
	# several complementary columns.
	#
	X = df.drop("count", axis="columns")
	X

	# %%
	# .. note::
	#
	# If the time information was only present as a date or datetime column, we
	# could have expanded it into hour-in-the-day, day-in-the-week,
	# day-in-the-month, month-in-the-year using pandas:
	# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-date-components
	#
	# We now introspect the distribution of the categorical variables, starting
	# with `"weather"`:
	#
	X["weather"].value_counts()

	# %%
	# Since there are only 3 `"heavy_rain"` events, we cannot use this category to
	# train machine learning models with cross validation. Instead, we simplify the
	# representation by collapsing those into the `"rain"` category.
	#
	X["weather"].replace(to_replace="heavy_rain", value="rain", inplace=True)
	# %%
	X["weather"].value_counts()

	# %%
	X.tail()

	# %%
	X.index = pd.Index(time_index).to_period("H")

	# %%
	X.head()

	# %%
	y.index = pd.Index(time_index).to_period("H")

	# %%
	# From now on, we will try to use sktime to to process the data
	# as a forecasting problem. Indeed, we want a model that will
	# augment the data by addind information from past samples.
	#
	# First, let's split the data into 2 sets: 2010 that will be used
	# for training and 2011 that will be used for testing.
	from sktime.forecasting.model_selection import temporal_train_test_split

	y_train, y_test, X_train, X_test = temporal_train_test_split(
	y, X, test_size=0.5
	)

	# %%
	# We will start by creating a dummy forecaster that will predict the
	# latest values seen in the time series.
	from sktime.forecasting.naive import NaiveForecaster

	forecaster = NaiveForecaster()
	forecaster.fit(y_train)

	# %%
	# The way of doing prediction is a not straightforward. One needs to
	# create a ForecastHorizon object that take the time index for which
	# one wants the prediction. This will enforce your time index to be
	# a regularly spaced period.
	from sktime.forecasting.base import ForecastingHorizon

	fh = ForecastingHorizon(
	y_test.index, is_relative=False
	)

	forecaster.predict(fh)

	# %%
	# Now, let's create a preprocessor to handle the heterogeneous data
	# in X. We also create an HGBRT as a predictor.
	from sklearn.pipeline import make_pipeline
	from sklearn.preprocessing import OrdinalEncoder
	from sklearn.compose import ColumnTransformer
	from sklearn.ensemble import HistGradientBoostingRegressor

	categorical_columns = [
	"weather",
	"season",
	"holiday",
	"workingday",
	]
	categories = [
	["clear", "misty", "rain"],
	["spring", "summer", "fall", "winter"],
	["False", "True"],
	["False", "True"],
	]
	ordinal_encoder = OrdinalEncoder(categories=categories)
	preprocessor = ColumnTransformer(
	transformers=[
	("categorical", ordinal_encoder, categorical_columns),
	],
	remainder="passthrough",
	)
	gbrt = HistGradientBoostingRegressor()

	# %%
	# A simple scikit-learn approach is to pipeline the preprocessor
	# and the HGBDT. However, we treat a sample by providing data only
	# at time t, without any hint from previous samples.
	gbrt_pipeline = make_pipeline(preprocessor, gbrt)
	gbrt_pipeline.fit(X_train, y_train)
	y_pred_regressor = gbrt_pipeline.predict(X_test)
	y_pred_regressor = pd.Series(y_pred_regressor, index=y_test.index)

	# %%
	# Now, we would like to have the same type of model that would take
	# into accound past samples. It seems that sktime is not good at
	# taking into account heterogeneous data and therefore we have to
	# manage everything by hand. One might wants to check the
	# ForecasterPipeline in case that it could be used instead.
	X_train_trans = preprocessor.fit_transform(X_train)
	X_test_trans = preprocessor.transform(X_test)

	# %%
	# Recreate some dataset with the right indices.
	X_train_trans = pd.DataFrame(
	X_train_trans, index=X_train.index, columns=X_train.columns,
	)
	X_test_trans = pd.DataFrame(
	X_test_trans, index=X_test.index, columns=X_test.columns,
	)

	# %%
	# We can create a forecaster from the scikit-learn pipeline. We
	# need to use make_reduction and specify the window_length to create
	# internally a new dataframe that will take the past samples into
	# account.
	#
	# The recursive strategy means that we will fit a single model to predict
	# every samples in the horizon. We could have more complex strategy to
	# use a particular model for each horizon sample.
	from sktime.forecasting.compose import make_reduction

	forecaster = make_reduction(
	gbrt, window_length=15, strategy="recursive", scitype="tabular-regressor"
	)


	# %%
	# Plotting the data, we could see that there is a general trend that
	# bike rental is generally increasing. We will take into account this
	# general trend by preprocessing the target.
	from sktime.transformations.series.detrend import Detrender
	from sktime.forecasting.trend import PolynomialTrendForecaster

	detrender = Detrender(PolynomialTrendForecaster(degree=1))

	# %%
	# The TransformedTargetForecaster has a weird Pipeline API.
	from sktime.forecasting.compose import TransformedTargetForecaster

	forecaster_detrended = TransformedTargetForecaster(
	steps=[
	("detrender", detrender),
	("forecaster", forecaster),
	]
	)

	# %%
	# The forecaster will not support ot have data with missing entry.
	# This is probably wrong but we will resample to have a regularly
	# spaced samples and use a forward fill.
	forecaster_detrended.fit(
	y=y_train.resample("H").ffill(),
	X=X_train_trans.resample("H").ffill())

	# %%
	# We do the same to get the predictions.
	y_pred_forecasting = forecaster_detrended.predict(
	fh, X=X_test_trans.resample("H").ffill()
	)

	# %%
	# Finally, let's make a plot by resampling by day in order to see
	# something on the plot.
	import matplotlib.pyplot as plt

	_, ax = plt.subplots()
	y_test.resample("D").mean().plot(ax=ax, label="True targets")
	y_pred_forecasting.resample("D").mean().plot(ax=ax, label="Forecaster")
	y_pred_regressor.resample("D").mean().plot(ax=ax, label="Regressor")
	ax.set_ylabel("Target")
	ax.legend()
	ax.set_title("Comparison of a regressor and a forecaster")

	# %%