Last active
April 20, 2017 01:50
-
-
Save jnothman/bb1608e6ffea3109ff2ce7c926b0e0cb to your computer and use it in GitHub Desktop.
Creates transform_feature_names singledispatch allowing feature names to be calculated in a pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from singledispatch import singledispatch | |
from sklearn.pipeline import Pipeline, FeatureUnion | |
from sklearn.feature_selection.base import SelectorMixin | |
from sklearn.feature_extraction.text import TfidfTransformer | |
from sklearn.decomposition import LatentDirichletAllocation | |
from sklearn.preprocessing import Imputer | |
from sklearn.preprocessing import FunctionTransformer | |
#from eli5 import explain_weights | |
#@explain_weights.register(Pipeline) | |
#def explain_weights_pipeline(estimator, feature_names=None, **kwargs): | |
# last_estimator = estimator.steps[-1][1] | |
# transform_pipeline = Pipeline(estimator.steps[:-1]) | |
# feature_names = transform_feature_names(transform_pipeline, feature_names) | |
# out = explain_weights(last_estimator, feature_names=feature_names) | |
# out.estimator = repr(estimator) | |
# return out | |
@singledispatch
def transform_feature_names(transformer, in_names=None):
    """Return the output feature names produced by ``transformer``.

    This is the generic fallback used for any type that has no registered
    implementation.  It delegates to the transformer's own
    ``get_feature_names()`` method when one exists; ``in_names`` is not
    consulted on this path.

    Raises
    ------
    NotImplementedError
        If ``transformer`` has no ``get_feature_names`` attribute.
    """
    if not hasattr(transformer, 'get_feature_names'):
        message = ('transform_feature_names not available for '
                   '{}'.format(transformer))
        raise NotImplementedError(message)
    return transformer.get_feature_names()
@transform_feature_names.register(Pipeline)
def _pipeline_names(est, in_names=None):
    """Thread feature names through every step of a Pipeline, in order.

    Starting from ``in_names``, each non-``None`` step in ``est.steps``
    is asked for its output names given the names produced by the step
    before it.  The names left after the last step are returned.
    """
    current = in_names
    for _, step in est.steps:
        if step is None:
            # A None step contributes nothing; names pass through as-is.
            continue
        current = transform_feature_names(step, current)
    return current
@transform_feature_names.register(FeatureUnion)
def _union_names(est, in_names=None):
    """Collect the names of all sub-transformers of a FeatureUnion.

    Every sub-transformer yielded by ``est._iter()`` is given the same
    ``in_names``.  Each resulting feature name is prefixed with the
    sub-transformer's name and a colon (``'<trans_name>:<feat_name>'``).
    The third element of each yielded tuple is ignored.
    """
    prefixed = []
    for label, member, _ in est._iter():
        for feature in transform_feature_names(member, in_names):
            prefixed.append('{}:{}'.format(label, feature))
    return prefixed
@transform_feature_names.register(SelectorMixin)
def _select_names(est, in_names=None):
    """Return the entries of ``in_names`` that the selector keeps.

    The kept positions are whatever ``est.get_support(indices=True)``
    reports; ``in_names`` must therefore be indexable by those values.
    """
    kept_positions = est.get_support(indices=True)
    return [in_names[position] for position in kept_positions]
def _formatted_names(fmt): | |
def transform_names(self, in_names=None): | |
return [fmt.format(name) for name in in_names] | |
return transform_names | |
def _component_names(fmt, attr): | |
def transform_names(self, in_names=None): | |
return [fmt.format(i) for i in range(getattr(self, attr))] | |
return transform_names | |
# Element-wise transformers: each output name is the input name wrapped
# in a label for the transformation.
transform_feature_names.register(
    TfidfTransformer, _formatted_names('tfidf({})'))
transform_feature_names.register(
    Imputer, _formatted_names('impute({})'))

# LDA output names are numbered from the estimator's ``n_topics``.
# NOTE(review): newer scikit-learn releases appear to have renamed this
# parameter to ``n_components`` -- confirm against the installed version.
transform_feature_names.register(
    LatentDirichletAllocation, _component_names('topic({})', 'n_topics'))
class FunctionTransformer(FunctionTransformer):
    """FunctionTransformer that also stores a feature-name function.

    This class deliberately shadows the imported ``FunctionTransformer``
    so that code in this module picks up the extended version.  All
    constructor arguments except ``feature_name_func`` are forwarded
    unchanged to the base class.

    Parameters
    ----------
    feature_name_func : callable or None
        Stored as ``self.feature_name_func``.  It is called with the
        input feature names to obtain the output feature names.
    """

    def __init__(self, func=None, inverse_func=None,
                 feature_name_func=None,
                 validate=True,
                 accept_sparse=False, pass_y=False,
                 kw_args=None, inv_kw_args=None):
        # Name the class explicitly.  ``super(type(self), self)`` resolves
        # to this same __init__ again whenever the instance belongs to a
        # subclass, which recurses forever.  By the time this runs, the
        # module-level name ``FunctionTransformer`` refers to this class.
        super(FunctionTransformer, self).__init__(
            func=func, inverse_func=inverse_func, validate=validate,
            accept_sparse=accept_sparse, pass_y=pass_y,
            kw_args=kw_args, inv_kw_args=inv_kw_args)
        self.feature_name_func = feature_name_func
@transform_feature_names.register(FunctionTransformer)
def _function_transformer_names(est, in_names=None):
    """Compute output names with the transformer's ``feature_name_func``.

    Returns whatever ``est.feature_name_func(in_names)`` returns.

    Raises
    ------
    NotImplementedError
        If ``est.feature_name_func`` is ``None``.  This replaces the
        opaque "'NoneType' object is not callable" failure and matches
        the error the generic ``transform_feature_names`` raises.
    """
    name_func = est.feature_name_func
    if name_func is None:
        raise NotImplementedError(
            'transform_feature_names not available for {}: '
            'feature_name_func is not set'.format(est))
    return name_func(in_names)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'm writing an estimator that just selects a subset of the input, and the fact that
get_feature_names()
is not implemented for this intermediate step is a problem. My implementation strongly resembles this:
http://stackoverflow.com/questions/42479370/getting-feature-names-from-within-a-featureunion-pipeline
Do you have any advice on using your method to patch a FeatureUnion with pipelines?