# sanchezg/projection_count_vectorizer.py

## projection_count_vectorizer.py
from sklearn.feature_extraction.text import CountVectorizer

class ProjectionCountVectorizer(CountVectorizer):
    """CountVectorizer that first projects each document onto a nested field.

    Every document is walked along ``projection_path`` (see
    :meth:`do_projection`) and the value found at the end of the path is
    handed to the preprocessor built by ``CountVectorizer``.

    NOTE(review): scikit-learn's estimator conventions say ``__init__``
    should not take ``*args`` (parameter introspection used by
    ``get_params``/``clone`` rejects varargs). The signature is kept as is
    for backward compatibility -- confirm whether this class is ever cloned
    or used inside a parameter search.
    """

    def __init__(self, projection_path, *args, **kwargs):
        """Store the projection path and forward the rest to CountVectorizer.

        Args:
            projection_path: Either a ``'/'``-separated string such as
                ``'payload/items/0/text'``, or an already split sequence of
                steps (needed when a key itself contains ``'/'``).
            *args: Positional arguments forwarded to ``CountVectorizer``.
            **kwargs: Keyword arguments forwarded to ``CountVectorizer``.
        """
        if isinstance(projection_path, str):
            self.projection_path = projection_path.split('/')
        else:
            # Already split; copy so the stored path is always a list that
            # is independent of the caller's sequence.
            self.projection_path = list(projection_path)
        super().__init__(*args, **kwargs)

    def build_preprocessor(self):
        """Return a callable that projects a document, then preprocesses it.

        The preprocessor built by the parent class is applied to the result
        of :meth:`do_projection` instead of to the raw document.
        """
        preprocess = super().build_preprocessor()

        def projection_and_preprocess(doc):
            return preprocess(self.do_projection(doc))
        return projection_and_preprocess

    @staticmethod
    def _sequence_index(step):
        """Return ``step`` as a list/tuple index, or ``None`` if it is not one."""
        if isinstance(step, int):
            return step
        if isinstance(step, str):
            digits = step[1:] if step.startswith('-') else step
            # isdecimal() rather than isdigit(): isdigit() is also true for
            # characters such as superscript digits, on which int() raises.
            if digits.isdecimal():
                return int(step)
        return None

    def do_projection(self, doc):
        """Follow ``self.projection_path`` through ``doc`` and return the target.

        Each step is applied to the value produced by the previous one:

        * ``dict``: the step is used as a key.
        * ``tuple``/``list``: an integer step (``'0'``, ``'-1'``, ...) is used
          as an index; any other step is read as an attribute, which only
          works for namedtuples.

        Args:
            doc: The raw document.

        Returns:
            The value found at the end of the path.

        Raises:
            ValueError: If a step has to be applied to a value that is
                neither a dict nor a tuple/list.
            KeyError, IndexError, AttributeError: Propagated unchanged when
                a step does not exist in the current value.
        """
        for step in self.projection_path:
            if isinstance(doc, dict):
                doc = doc[step]
            elif isinstance(doc, (tuple, list)):
                index = self._sequence_index(step)
                if index is not None:
                    doc = doc[index]
                else:   # only valid for namedtuples
                    doc = getattr(doc, step)
            else:
                raise ValueError('cant apply step %s' % step)
        return doc
	from sklearn.feature_extraction.text import CountVectorizer

	# NOTE(review): this is a second definition of ProjectionCountVectorizer;
	# the same class is already declared earlier in this file. Consider
	# removing one of the two copies.
	class ProjectionCountVectorizer(CountVectorizer):
	    """CountVectorizer that first projects each document onto a nested field.

	    Every document is walked along ``projection_path`` (see
	    :meth:`do_projection`) and the value found at the end of the path is
	    handed to the preprocessor built by ``CountVectorizer``.

	    NOTE(review): scikit-learn's estimator conventions say ``__init__``
	    should not take ``*args`` (parameter introspection used by
	    ``get_params``/``clone`` rejects varargs). The signature is kept as is
	    for backward compatibility -- confirm whether this class is ever
	    cloned or used inside a parameter search.
	    """

	    def __init__(self, projection_path, *args, **kwargs):
	        """Store the projection path and forward the rest to CountVectorizer.

	        Args:
	            projection_path: Either a ``'/'``-separated string such as
	                ``'payload/items/0/text'``, or an already split sequence
	                of steps (needed when a key itself contains ``'/'``).
	            *args: Positional arguments forwarded to ``CountVectorizer``.
	            **kwargs: Keyword arguments forwarded to ``CountVectorizer``.
	        """
	        if isinstance(projection_path, str):
	            self.projection_path = projection_path.split('/')
	        else:
	            # Already split; copy so the stored path is always a list that
	            # is independent of the caller's sequence.
	            self.projection_path = list(projection_path)
	        super().__init__(*args, **kwargs)

	    def build_preprocessor(self):
	        """Return a callable that projects a document, then preprocesses it.

	        The preprocessor built by the parent class is applied to the
	        result of :meth:`do_projection` instead of to the raw document.
	        """
	        preprocess = super().build_preprocessor()

	        def projection_and_preprocess(doc):
	            return preprocess(self.do_projection(doc))
	        return projection_and_preprocess

	    @staticmethod
	    def _sequence_index(step):
	        """Return ``step`` as a list/tuple index, or ``None`` if it is not one."""
	        if isinstance(step, int):
	            return step
	        if isinstance(step, str):
	            digits = step[1:] if step.startswith('-') else step
	            # isdecimal() rather than isdigit(): isdigit() is also true for
	            # characters such as superscript digits, on which int() raises.
	            if digits.isdecimal():
	                return int(step)
	        return None

	    def do_projection(self, doc):
	        """Follow ``self.projection_path`` through ``doc`` and return the target.

	        Each step is applied to the value produced by the previous one:

	        * ``dict``: the step is used as a key.
	        * ``tuple``/``list``: an integer step (``'0'``, ``'-1'``, ...) is
	          used as an index; any other step is read as an attribute, which
	          only works for namedtuples.

	        Args:
	            doc: The raw document.

	        Returns:
	            The value found at the end of the path.

	        Raises:
	            ValueError: If a step has to be applied to a value that is
	                neither a dict nor a tuple/list.
	            KeyError, IndexError, AttributeError: Propagated unchanged
	                when a step does not exist in the current value.
	        """
	        for step in self.projection_path:
	            if isinstance(doc, dict):
	                doc = doc[step]
	            elif isinstance(doc, (tuple, list)):
	                index = self._sequence_index(step)
	                if index is not None:
	                    doc = doc[index]
	                else:   # only valid for namedtuples
	                    doc = getattr(doc, step)
	            else:
	                raise ValueError('cant apply step %s' % step)
	        return doc