Skip to content

Instantly share code, notes, and snippets.

@jnothman
Last active April 20, 2017 01:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jnothman/bb1608e6ffea3109ff2ce7c926b0e0cb to your computer and use it in GitHub Desktop.
Save jnothman/bb1608e6ffea3109ff2ce7c926b0e0cb to your computer and use it in GitHub Desktop.
Creates transform_feature_names singledispatch allowing feature names to be calculated in a pipeline
from singledispatch import singledispatch
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection.base import SelectorMixin
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import FunctionTransformer
#from eli5 import explain_weights
#@explain_weights.register(Pipeline)
#def explain_weights_pipeline(estimator, feature_names=None, **kwargs):
# last_estimator = estimator.steps[-1][1]
# transform_pipeline = Pipeline(estimator.steps[:-1])
# feature_names = transform_feature_names(transform_pipeline, feature_names)
# out = explain_weights(last_estimator, feature_names=feature_names)
# out.estimator = repr(estimator)
# return out
@singledispatch
def transform_feature_names(transformer, in_names=None):
    """Return the output feature names of ``transformer``.

    This is the generic fallback of a single-dispatch function; more
    specific implementations are registered per transformer type.

    Parameters
    ----------
    transformer : object
        Object whose output feature names are wanted.
    in_names : optional
        Names of the input features. The fallback does not use them.

    Returns
    -------
    Whatever ``transformer.get_feature_names()`` returns.

    Raises
    ------
    NotImplementedError
        If ``transformer`` has no ``get_feature_names`` attribute.
    """
    # Guard clause: bail out early when the transformer cannot name its
    # own outputs and no specialised implementation was dispatched.
    if not hasattr(transformer, 'get_feature_names'):
        message = 'transform_feature_names not available for {}'.format(
            transformer)
        raise NotImplementedError(message)
    return transformer.get_feature_names()
@transform_feature_names.register(Pipeline)
def _pipeline_names(est, in_names=None):
    """Thread feature names through every step of a Pipeline.

    Parameters
    ----------
    est : Pipeline
        Pipeline whose ``steps`` are applied in order.
    in_names : optional
        Feature names fed into the first step.

    Returns
    -------
    The names produced by the last step, or ``in_names`` unchanged when
    every step is skipped.
    """
    names = in_names
    for _, trans in est.steps:
        # A step set to None or 'passthrough' leaves the data untouched,
        # so the feature names carry over unchanged. Dispatching on the
        # 'passthrough' string would otherwise raise NotImplementedError.
        if trans is None or trans == 'passthrough':
            continue
        names = transform_feature_names(trans, names)
    return names
@transform_feature_names.register(FeatureUnion)
def _union_names(est, in_names=None):
    """Concatenate the feature names of every transformer in a FeatureUnion.

    Each output name is prefixed with the name of the transformer that
    produced it, in the form ``'<transformer name>:<feature name>'``.
    Every sub-transformer receives the same ``in_names``.
    """
    prefixed = []
    # est._iter() yields triples; only the first two elements (the
    # transformer's name and the transformer itself) are used here.
    for trans_name, trans, _ in est._iter():
        for feat_name in transform_feature_names(trans, in_names):
            prefixed.append('{}:{}'.format(trans_name, feat_name))
    return prefixed
@transform_feature_names.register(SelectorMixin)
def _select_names(est, in_names=None):
    """Return the subset of ``in_names`` retained by a feature selector.

    The positions to keep come from ``est.get_support(indices=True)`` and
    are used to index directly into ``in_names``.
    """
    kept_indices = est.get_support(indices=True)
    selected = []
    for position in kept_indices:
        selected.append(in_names[position])
    return selected
def _formatted_names(fmt):
    """Build a feature-name function that wraps each input name in ``fmt``.

    Parameters
    ----------
    fmt : str
        Format string with one positional ``{}`` placeholder that receives
        the input feature name.

    Returns
    -------
    callable
        ``transform_names(self, in_names=None)`` returning a list with
        ``fmt`` applied to each entry of ``in_names``.
    """
    def transform_names(self, in_names=None):
        # The first argument (the transformer) is not consulted; output
        # names depend only on the input names.
        renamed = []
        for original in in_names:
            renamed.append(fmt.format(original))
        return renamed

    return transform_names
def _component_names(fmt, attr):
    """Build a feature-name function that numbers a transformer's outputs.

    Parameters
    ----------
    fmt : str
        Format string with one positional ``{}`` placeholder that receives
        the output index.
    attr : str
        Name of the attribute on the transformer holding the number of
        outputs.

    Returns
    -------
    callable
        ``transform_names(self, in_names=None)`` returning
        ``[fmt.format(0), ..., fmt.format(n - 1)]`` where ``n`` is
        ``getattr(self, attr)``.
    """
    def transform_names(self, in_names=None):
        # The input names are not consulted; outputs are named purely by
        # their index.
        count = getattr(self, attr)
        numbered = []
        for index in range(count):
            numbered.append(fmt.format(index))
        return numbered

    return transform_names
def _lda_topic_names(est, in_names=None):
    """Name the outputs of a LatentDirichletAllocation as ``topic(i)``.

    scikit-learn 0.19 renamed the ``n_topics`` parameter to
    ``n_components``. During the deprecation period ``n_topics`` defaults
    to None, and later releases drop the attribute altogether, so reading
    ``n_topics`` alone breaks on those versions. An explicitly set
    ``n_topics`` is honoured first; otherwise ``n_components`` is used.
    The input names are not consulted.
    """
    n_topics = getattr(est, 'n_topics', None)
    if n_topics is None:
        n_topics = est.n_components
    return ['topic({})'.format(i) for i in range(n_topics)]


# Register name builders for transformers whose output names can be
# derived mechanically from the input names or from the output count.
transform_feature_names.register(TfidfTransformer)(_formatted_names('tfidf({})'))
transform_feature_names.register(Imputer)(_formatted_names('impute({})'))
transform_feature_names.register(LatentDirichletAllocation)(_lda_topic_names)
class FunctionTransformer(FunctionTransformer):
    """FunctionTransformer that also carries a feature-name function.

    This subclass deliberately shadows the imported base class of the same
    name. It accepts one extra constructor argument, ``feature_name_func``,
    and stores it on the instance; every other argument is forwarded
    unchanged to the base class constructor.

    Parameters
    ----------
    feature_name_func : callable, optional
        Called with the input feature names to obtain the output feature
        names when ``transform_feature_names`` is applied to this object.
    func, inverse_func, validate, accept_sparse, pass_y, kw_args, inv_kw_args
        Passed through to the base ``FunctionTransformer``.
    """

    def __init__(self, func=None, inverse_func=None,
                 feature_name_func=None,
                 validate=True,
                 accept_sparse=False, pass_y=False,
                 kw_args=None, inv_kw_args=None):
        # Name the class explicitly. super(type(self), self) resolves to
        # this very __init__ again whenever the instance belongs to a
        # subclass, which recurses until the stack overflows.
        super(FunctionTransformer, self).__init__(
            func=func, inverse_func=inverse_func, validate=validate,
            accept_sparse=accept_sparse, pass_y=pass_y,
            kw_args=kw_args, inv_kw_args=inv_kw_args)
        self.feature_name_func = feature_name_func
@transform_feature_names.register(FunctionTransformer)
def _function_transformer_names(est, in_names=None):
    """Derive output feature names via the transformer's ``feature_name_func``.

    Parameters
    ----------
    est : FunctionTransformer
        Transformer carrying a ``feature_name_func`` attribute.
    in_names : optional
        Input feature names, passed straight to ``feature_name_func``.

    Returns
    -------
    Whatever ``est.feature_name_func(in_names)`` returns.

    Raises
    ------
    NotImplementedError
        If ``est.feature_name_func`` is None, i.e. no way of naming the
        outputs was supplied.
    """
    name_func = est.feature_name_func
    if name_func is None:
        # Fail the same way the generic fallback does rather than with an
        # opaque "'NoneType' object is not callable" TypeError.
        raise NotImplementedError(
            'transform_feature_names not available for {}: '
            'no feature_name_func was provided'.format(est))
    return name_func(in_names)
@mrlevitas
Copy link

I'm writing an estimator that just selects a subset of the input, and I'm running into a problem because get_feature_names() is not implemented for this intermediate step.
My implementation strongly resembles this:
http://stackoverflow.com/questions/42479370/getting-feature-names-from-within-a-featureunion-pipeline

Do you have any advice on using your method to patch a FeatureUnion that contains pipelines?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment