
@alberto-santini
Last active February 18, 2022 09:59
Error when using ColumnTransformer twice
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from mlxtend.regressor import StackingRegressor
import numpy as np
import pandas as pd
# We use the Ames house prices dataset for this example
d = fetch_openml('house_prices', as_frame=True).frame
# Small data preprocessing:
for column in d.columns:
    if d[column].dtype == object or column == 'MSSubClass':
        d[column] = pd.Categorical(d[column])
d.drop(columns='Id', inplace=True)
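# Quick sanity check (an editor's sketch, not in the original gist): the
# dtype split printed here is exactly what the make_column_selector calls
# below rely on.
print(d.dtypes.value_counts())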
# Inner linear regressor:
# 1. 1-hot encode categorical features
# 2. Standardise numerical features
# 3. Perform Linear Regression
sr_linear = Pipeline(steps=[
    ('preprocessing', ColumnTransformer(transformers=[
        ('categorical',
         make_pipeline(OneHotEncoder(), StandardScaler()),
         make_column_selector(dtype_include='category')),
        ('numerical',
         StandardScaler(),
         make_column_selector(dtype_include=np.number))
    ])),
    ('model', LinearRegression())
])
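# Hedged side note (not from the original gist): OneHotEncoder outputs a
# sparse matrix by default in these scikit-learn versions, and StandardScaler
# with its default with_mean=True cannot centre sparse input. If execution
# ever reaches this step, a variant such as
#     make_pipeline(OneHotEncoder(), StandardScaler(with_mean=False))
# avoids that ValueError.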
# Just a decision tree regressor...
# Because it supports categorical features natively, there is no
# need to 1-hot-encode them. There is also no benefit in
# standardising numerical features.
sr_tree = DecisionTreeRegressor()
# Column transformer which performs imputation.
# Categorical columns which contain NaN get a 'None' value.
# Numerical columns which contain NaN get imputed with the median.
ct_imputation = ColumnTransformer(transformers=[
    ('categorical',
     SimpleImputer(strategy='constant', fill_value='None'),
     make_column_selector(dtype_include='category')),
    ('numerical',
     SimpleImputer(strategy='median'),
     make_column_selector(dtype_include=np.number))
])
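# Quick check (a minimal sketch, not in the original gist): fitting the
# imputer directly shows that ColumnTransformer returns a plain numpy array,
# which is what motivates the back_to_pandas step below.
print(type(ct_imputation.fit_transform(d)))  # <class 'numpy.ndarray'>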
# Pipeline for a stacked regressor:
# 1. Perform imputation via the ColumnTransformer
# 2. Because ColumnTransformer returns a numpy array, but the preprocessing
#    steps of the inner regressors need a pandas DataFrame, we convert the
#    numpy array back to a DataFrame. We can use get_feature_names_out()
#    because we are using the nightly version of scikit-learn: the current
#    stable version 1.0.2 does not support it. We can install the nightly
#    version with just one pip command:
#    https://scikit-learn.org/stable/developers/advanced_installation.html
# 3. Apply the StackingRegressor
stacked_regressor = Pipeline(steps=[
    ('imputation', ct_imputation),
    ('back_to_pandas', FunctionTransformer(
        func=lambda values: pd.DataFrame(values, columns=ct_imputation.get_feature_names_out())
    )),
    ('model', StackingRegressor(
        regressors=[sr_linear, sr_tree],
        meta_regressor=DecisionTreeRegressor(),
        use_features_in_secondary=True
    ))
])
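# Hedged note (not in the original gist): in mlxtend,
# use_features_in_secondary=True trains the meta-regressor on the original
# features alongside the predictions of the level-one regressors.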
# Prepare the data for training
label = 'SalePrice'
features = [col for col in d.columns if col != label]
X, y = d[features], d[label]
# Split the data, then train the stacked regressor
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
stacked_regressor.fit(X_train, y_train)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [37], in <module>
----> 1 stacked_regressor.fit(X_train, y_train)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/pipeline.py:394, in Pipeline.fit(self, X, y, **fit_params)
392 if self._final_estimator != "passthrough":
393 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 394 self._final_estimator.fit(Xt, y, **fit_params_last_step)
396 return self
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/mlxtend/regressor/stacking_regression.py:168, in StackingRegressor.fit(self, X, y, sample_weight)
165 print(_name_estimators((regr,))[0][1])
167 if sample_weight is None:
--> 168 regr.fit(X, y)
169 else:
170 regr.fit(X, y, sample_weight=sample_weight)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/pipeline.py:390, in Pipeline.fit(self, X, y, **fit_params)
364 """Fit the model.
365
366 Fit all the transformers one after the other and transform the
(...)
387 Pipeline with fitted steps.
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/pipeline.py:348, in Pipeline._fit(self, X, y, **fit_params_steps)
346 cloned_transformer = clone(transformer)
347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
349 cloned_transformer,
350 X,
351 y,
352 None,
353 message_clsname="Pipeline",
354 message=self._log_message(step_idx),
355 **fit_params_steps[name],
356 )
357 # Replace the transformer of the step with the fitted
358 # transformer. This is necessary when loading the transformer
359 # from the cache.
360 self.steps[step_idx] = (name, fitted_transformer)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/joblib/memory.py:349, in NotMemorizedFunc.__call__(self, *args, **kwargs)
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py:672, in ColumnTransformer.fit_transform(self, X, y)
670 self._check_n_features(X, reset=True)
671 self._validate_transformers()
--> 672 self._validate_column_callables(X)
673 self._validate_remainder(X)
675 result = self._fit_transform(X, y, _fit_transform_one)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py:350, in ColumnTransformer._validate_column_callables(self, X)
348 for name, _, columns in self.transformers:
349 if callable(columns):
--> 350 columns = columns(X)
351 all_columns.append(columns)
352 transformer_to_input_indices[name] = _get_column_indices(X, columns)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py:1036, in make_column_selector.__call__(self, df)
1027 """Callable for column selection to be used by a
1028 :class:`ColumnTransformer`.
1029
(...)
1033 DataFrame to select columns from.
1034 """
1035 if not hasattr(df, "iloc"):
-> 1036 raise ValueError(
1037 "make_column_selector can only be applied to pandas dataframes"
1038 )
1039 df_row = df.iloc[:1]
1040 if self.dtype_include is not None or self.dtype_exclude is not None:
ValueError: make_column_selector can only be applied to pandas dataframes
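# Editor's reading of the traceback above (hedged): back_to_pandas does hand
# a DataFrame to the StackingRegressor, but mlxtend appears to validate its
# input with scikit-learn's check_X_y/check_array machinery before calling
# regr.fit(X, y) (stacking_regression.py:168 above), which converts the
# DataFrame to a numpy array. The inner pipeline's ColumnTransformer then
# invokes make_column_selector on that array, hence the ValueError.
#
# One possible workaround (a sketch under that reading, not a verified fix):
# give the inner ColumnTransformer integer column positions instead of
# make_column_selector, so it also accepts numpy input. The positions assume
# ct_imputation's output layout: its categorical block first, then the
# numerical block.
n_cat = len(make_column_selector(dtype_include='category')(X))
n_num = len(make_column_selector(dtype_include=np.number)(X))
sr_linear_positional = Pipeline(steps=[
    ('preprocessing', ColumnTransformer(transformers=[
        ('categorical',
         # with_mean=False because one-hot output is sparse (see note above)
         make_pipeline(OneHotEncoder(), StandardScaler(with_mean=False)),
         list(range(n_cat))),
        ('numerical',
         StandardScaler(),
         list(range(n_cat, n_cat + n_num)))
    ])),
    ('model', LinearRegression())
])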