Skip to content

Instantly share code, notes, and snippets.

@glemaitre
Created September 22, 2021 19:18
Show Gist options
  • Save glemaitre/1649fe00f38a803e89060e9810400b52 to your computer and use it in GitHub Desktop.
Save glemaitre/1649fe00f38a803e89060e9810400b52 to your computer and use it in GitHub Desktop.
# %%
# Download the original dataset to be able to easily build an index with the
# original datetime.
# The dataset is available at:
# https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
import pandas as pd
df_external = pd.read_csv(
"~/Downloads/Bike-Sharing-Dataset/hour.csv",
index_col=0,
)
df_external.head()
# %%
dteday = pd.to_datetime(df_external["dteday"])
# %%
time_index = pd.to_datetime(
{
"year": dteday.dt.year,
"month": dteday.dt.month,
"day": dteday.dt.day,
"hour": df_external["hr"],
}
)
# %%
# We create the same dataset than in the original example:
# https://scikit-learn.org/dev/auto_examples/applications/plot_cyclical_feature_engineering.html#sphx-glr-auto-examples-applications-plot-cyclical-feature-engineering-py
from sklearn.datasets import fetch_openml
bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
df = bike_sharing.frame
# %%
# To get a quick understanding of the periodic patterns of the data, let us
# have a look at the average demand per hour during a week.
#
# Note that the week starts on a Sunday, during the weekend. We can clearly
# distinguish the commute patterns in the morning and evenings of the work days
# and the leisure use of the bikes on the weekends with a more spread peak
# demand around the middle of the days:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(12, 4))
average_week_demand = df.groupby(["weekday", "hour"]).mean()["count"]
average_week_demand.plot(ax=ax)
_ = ax.set(
title="Average hourly bike demand during the week",
xticks=[i * 24 for i in range(7)],
xticklabels=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"],
xlabel="Time of the week",
ylabel="Number of bike rentals",
)
# %%
#
# The target of the prediction problem is the absolute count of bike rentals on
# a hourly basis:
df["count"].max()
# %%
#
# Let us rescale the target variable (number of hourly bike rentals) to predict
# a relative demand so that the mean absolute error is more easily interpreted
# as a fraction of the maximum demand.
#
# .. note::
#
# The fit method of the models used in this notebook all minimize the
# mean squared error to estimate the conditional mean instead of the mean
# absolute error that would fit an estimator of the conditional median.
#
# When reporting performance measure on the test set in the discussion, we
# instead choose to focus on the mean absolute error that is more
# intuitive than the (root) mean squared error. Note, however, that the
# best models for one metric are also the best for the other in this
# study.
y = df["count"] / 1000
# %%
fig, ax = plt.subplots(figsize=(12, 4))
y.hist(bins=30, ax=ax)
_ = ax.set(
xlabel="Fraction of rented fleet demand",
ylabel="Number of hours",
)
# %%
# The input feature data frame is a time annotated hourly log of variables
# describing the weather conditions. It includes both numerical and categorical
# variables. Note that the time information has already been expanded into
# several complementary columns.
#
X = df.drop("count", axis="columns")
X
# %%
# .. note::
#
# If the time information was only present as a date or datetime column, we
# could have expanded it into hour-in-the-day, day-in-the-week,
# day-in-the-month, month-in-the-year using pandas:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-date-components
#
# We now introspect the distribution of the categorical variables, starting
# with `"weather"`:
#
X["weather"].value_counts()
# %%
# Since there are only 3 `"heavy_rain"` events, we cannot use this category to
# train machine learning models with cross validation. Instead, we simplify the
# representation by collapsing those into the `"rain"` category.
#
X["weather"].replace(to_replace="heavy_rain", value="rain", inplace=True)
# %%
X["weather"].value_counts()
# %%
X.tail()
# %%
X.index = pd.Index(time_index).to_period("H")
# %%
X.head()
# %%
y.index = pd.Index(time_index).to_period("H")
# %%
# From now on, we will try to use sktime to to process the data
# as a forecasting problem. Indeed, we want a model that will
# augment the data by addind information from past samples.
#
# First, let's split the data into 2 sets: 2010 that will be used
# for training and 2011 that will be used for testing.
from sktime.forecasting.model_selection import temporal_train_test_split
y_train, y_test, X_train, X_test = temporal_train_test_split(
y, X, test_size=0.5
)
# %%
# We will start by creating a dummy forecaster that will predict the
# latest values seen in the time series.
from sktime.forecasting.naive import NaiveForecaster
forecaster = NaiveForecaster()
forecaster.fit(y_train)
# %%
# The way of doing prediction is a not straightforward. One needs to
# create a ForecastHorizon object that take the time index for which
# one wants the prediction. This will enforce your time index to be
# a regularly spaced period.
from sktime.forecasting.base import ForecastingHorizon
fh = ForecastingHorizon(
y_test.index, is_relative=False
)
forecaster.predict(fh)
# %%
# Now, let's create a preprocessor to handle the heterogeneous data
# in X. We also create an HGBRT as a predictor.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
categorical_columns = [
"weather",
"season",
"holiday",
"workingday",
]
categories = [
["clear", "misty", "rain"],
["spring", "summer", "fall", "winter"],
["False", "True"],
["False", "True"],
]
ordinal_encoder = OrdinalEncoder(categories=categories)
preprocessor = ColumnTransformer(
transformers=[
("categorical", ordinal_encoder, categorical_columns),
],
remainder="passthrough",
)
gbrt = HistGradientBoostingRegressor()
# %%
# A simple scikit-learn approach is to pipeline the preprocessor
# and the HGBDT. However, we treat a sample by providing data only
# at time t, without any hint from previous samples.
gbrt_pipeline = make_pipeline(preprocessor, gbrt)
gbrt_pipeline.fit(X_train, y_train)
y_pred_regressor = gbrt_pipeline.predict(X_test)
y_pred_regressor = pd.Series(y_pred_regressor, index=y_test.index)
# %%
# Now, we would like to have the same type of model that would take
# into accound past samples. It seems that sktime is not good at
# taking into account heterogeneous data and therefore we have to
# manage everything by hand. One might wants to check the
# ForecasterPipeline in case that it could be used instead.
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)
# %%
# Recreate some dataset with the right indices.
X_train_trans = pd.DataFrame(
X_train_trans, index=X_train.index, columns=X_train.columns,
)
X_test_trans = pd.DataFrame(
X_test_trans, index=X_test.index, columns=X_test.columns,
)
# %%
# We can create a forecaster from the scikit-learn pipeline. We
# need to use make_reduction and specify the window_length to create
# internally a new dataframe that will take the past samples into
# account.
#
# The recursive strategy means that we will fit a single model to predict
# every samples in the horizon. We could have more complex strategy to
# use a particular model for each horizon sample.
from sktime.forecasting.compose import make_reduction
forecaster = make_reduction(
gbrt, window_length=15, strategy="recursive", scitype="tabular-regressor"
)
# %%
# Plotting the data, we could see that there is a general trend that
# bike rental is generally increasing. We will take into account this
# general trend by preprocessing the target.
from sktime.transformations.series.detrend import Detrender
from sktime.forecasting.trend import PolynomialTrendForecaster
detrender = Detrender(PolynomialTrendForecaster(degree=1))
# %%
# The TransformedTargetForecaster has a weird Pipeline API.
from sktime.forecasting.compose import TransformedTargetForecaster
forecaster_detrended = TransformedTargetForecaster(
steps=[
("detrender", detrender),
("forecaster", forecaster),
]
)
# %%
# The forecaster will not support ot have data with missing entry.
# This is probably wrong but we will resample to have a regularly
# spaced samples and use a forward fill.
forecaster_detrended.fit(
y=y_train.resample("H").ffill(),
X=X_train_trans.resample("H").ffill())
# %%
# We do the same to get the predictions.
y_pred_forecasting = forecaster_detrended.predict(
fh, X=X_test_trans.resample("H").ffill()
)
# %%
# Finally, let's make a plot by resampling by day in order to see
# something on the plot.
import matplotlib.pyplot as plt
_, ax = plt.subplots()
y_test.resample("D").mean().plot(ax=ax, label="True targets")
y_pred_forecasting.resample("D").mean().plot(ax=ax, label="Forecaster")
y_pred_regressor.resample("D").mean().plot(ax=ax, label="Regressor")
ax.set_ylabel("Target")
ax.legend()
ax.set_title("Comparison of a regressor and a forecaster")
# %%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment