
@alberto-santini
Last active February 18, 2022 09:59
Error when using ColumnTransformer twice
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from mlxtend.regressor import StackingRegressor
import numpy as np
import pandas as pd
# We use the Ames house prices dataset for this example
d = fetch_openml('house_prices', as_frame=True).frame
# Small data preprocessing:
for column in d.columns:
    if d[column].dtype == object or column == 'MSSubClass':
        d[column] = pd.Categorical(d[column])
d.drop(columns='Id', inplace=True)
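# Quick sanity check (an editor's sketch, not in the original gist): the
# dtype split printed here is exactly what the make_column_selector calls
# below rely on.
print(d.dtypes.value_counts())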
# Inner linear regressor:
# 1. 1-hot encode categorical features
# 2. Standardise numerical features
# 3. Perform Linear Regression
sr_linear = Pipeline(steps=[
    ('preprocessing', ColumnTransformer(transformers=[
        ('categorical',
         make_pipeline(OneHotEncoder(), StandardScaler()),
         make_column_selector(dtype_include='category')),
        ('numerical',
         StandardScaler(),
         make_column_selector(dtype_include=np.number))
    ])),
    ('model', LinearRegression())
])
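# Hedged side note (not from the original gist): OneHotEncoder outputs a
# sparse matrix by default in these scikit-learn versions, and StandardScaler
# with its default with_mean=True cannot centre sparse input. If execution
# ever reaches this step, a variant such as
#     make_pipeline(OneHotEncoder(), StandardScaler(with_mean=False))
# avoids that ValueError.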
# Just a decision tree regressor...
# Because it supports categorical features natively, there is no
# need to 1-hot-encode them. There is also no benefit in
# standardising numerical features.
sr_tree = DecisionTreeRegressor()
# Column transformer which performs imputation.
# Categorical columns which contain NaN get a 'None' value.
# Numerical columns which contain NaN get imputed with the median.
ct_imputation = ColumnTransformer(transformers=[
    ('categorical',
     SimpleImputer(strategy='constant', fill_value='None'),
     make_column_selector(dtype_include='category')),
    ('numerical',
     SimpleImputer(strategy='median'),
     make_column_selector(dtype_include=np.number))
])
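# Quick check (a minimal sketch, not in the original gist): fitting the
# imputer directly shows that ColumnTransformer returns a plain numpy array,
# which is what motivates the back_to_pandas step below.
print(type(ct_imputation.fit_transform(d)))  # <class 'numpy.ndarray'>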
# Pipeline for a stacked regressor:
# 1. Perform imputation via the ColumnTransformer
# 2. Because ColumnTransformer returns a numpy array, but the preprocessing
#    steps of the inner regressors need a pandas DataFrame, we convert the
#    numpy array back to a DataFrame. We can use get_feature_names_out()
#    because we are using the nightly version of scikit-learn: the current
#    stable version 1.0.2 does not support it. We can install the nightly
#    version with just one pip command:
#    https://scikit-learn.org/stable/developers/advanced_installation.html
# 3. Apply the StackingRegressor
stacked_regressor = Pipeline(steps=[
    ('imputation', ct_imputation),
    ('back_to_pandas', FunctionTransformer(
        func=lambda values: pd.DataFrame(values, columns=ct_imputation.get_feature_names_out())
    )),
    ('model', StackingRegressor(
        regressors=[sr_linear, sr_tree],
        meta_regressor=DecisionTreeRegressor(),
        use_features_in_secondary=True
    ))
])
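# Hedged note (not in the original gist): in mlxtend,
# use_features_in_secondary=True trains the meta-regressor on the original
# features alongside the predictions of the level-one regressors.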
# Prepare the data for training
label = 'SalePrice'
features = [col for col in d.columns if col != label]
X, y = d[features], d[label]
# Split the data, then train the stacked regressor
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
stacked_regressor.fit(X_train, y_train)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [37], in <module>
----> 1 stacked_regressor.fit(X_train, y_train)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/pipeline.py:394, in Pipeline.fit(self, X, y, **fit_params)
392 if self._final_estimator != "passthrough":
393 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 394 self._final_estimator.fit(Xt, y, **fit_params_last_step)
396 return self
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/mlxtend/regressor/stacking_regression.py:168, in StackingRegressor.fit(self, X, y, sample_weight)
165 print(_name_estimators((regr,))[0][1])
167 if sample_weight is None:
--> 168 regr.fit(X, y)
169 else:
170 regr.fit(X, y, sample_weight=sample_weight)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/pipeline.py:390, in Pipeline.fit(self, X, y, **fit_params)
364 """Fit the model.
365
366 Fit all the transformers one after the other and transform the
(...)
387 Pipeline with fitted steps.
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/pipeline.py:348, in Pipeline._fit(self, X, y, **fit_params_steps)
346 cloned_transformer = clone(transformer)
347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
349 cloned_transformer,
350 X,
351 y,
352 None,
353 message_clsname="Pipeline",
354 message=self._log_message(step_idx),
355 **fit_params_steps[name],
356 )
357 # Replace the transformer of the step with the fitted
358 # transformer. This is necessary when loading the transformer
359 # from the cache.
360 self.steps[step_idx] = (name, fitted_transformer)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/joblib/memory.py:349, in NotMemorizedFunc.__call__(self, *args, **kwargs)
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py:672, in ColumnTransformer.fit_transform(self, X, y)
670 self._check_n_features(X, reset=True)
671 self._validate_transformers()
--> 672 self._validate_column_callables(X)
673 self._validate_remainder(X)
675 result = self._fit_transform(X, y, _fit_transform_one)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py:350, in ColumnTransformer._validate_column_callables(self, X)
348 for name, _, columns in self.transformers:
349 if callable(columns):
--> 350 columns = columns(X)
351 all_columns.append(columns)
352 transformer_to_input_indices[name] = _get_column_indices(X, columns)
File ~/.conda/envs/kaggle/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py:1036, in make_column_selector.__call__(self, df)
1027 """Callable for column selection to be used by a
1028 :class:`ColumnTransformer`.
1029
(...)
1033 DataFrame to select columns from.
1034 """
1035 if not hasattr(df, "iloc"):
-> 1036 raise ValueError(
1037 "make_column_selector can only be applied to pandas dataframes"
1038 )
1039 df_row = df.iloc[:1]
1040 if self.dtype_include is not None or self.dtype_exclude is not None:
ValueError: make_column_selector can only be applied to pandas dataframes
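# Editor's reading of the traceback above (hedged): back_to_pandas does hand
# a DataFrame to the StackingRegressor, but mlxtend appears to validate its
# input with scikit-learn's check_X_y/check_array machinery before calling
# regr.fit(X, y) (stacking_regression.py:168 above), which converts the
# DataFrame to a numpy array. The inner pipeline's ColumnTransformer then
# invokes make_column_selector on that array, hence the ValueError.
#
# One possible workaround (a sketch under that reading, not a verified fix):
# give the inner ColumnTransformer integer column positions instead of
# make_column_selector, so it also accepts numpy input. The positions assume
# ct_imputation's output layout: its categorical block first, then the
# numerical block.
n_cat = len(make_column_selector(dtype_include='category')(X))
n_num = len(make_column_selector(dtype_include=np.number)(X))
sr_linear_positional = Pipeline(steps=[
    ('preprocessing', ColumnTransformer(transformers=[
        ('categorical',
         # with_mean=False because one-hot output is sparse (see note above)
         make_pipeline(OneHotEncoder(), StandardScaler(with_mean=False)),
         list(range(n_cat))),
        ('numerical',
         StandardScaler(),
         list(range(n_cat, n_cat + n_num)))
    ])),
    ('model', LinearRegression())
])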