@kmike
Created March 27, 2014 11:21

import numpy as np
import scipy.sparse as sp
import hat_trie

from sklearn.feature_extraction.text import CountVectorizer, _make_int_array


class HatTrieCountVectorizer(CountVectorizer):
    """CountVectorizer that keeps its vocabulary in a hat_trie.Trie instead of
    a plain dict; only the fixed_vocab=False (fitting) path is implemented."""
    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False
        """
        if fixed_vocab:
            raise NotImplementedError()

        vocabulary = hat_trie.Trie()
        analyze = self.build_analyzer()
        j_indices = _make_int_array()
        indptr = _make_int_array()
        indptr.append(0)
        for doc in raw_documents:
            for feature in analyze(doc):
                if feature not in vocabulary:
                    # new term: assign the next free column index
                    idx = len(vocabulary)
                    vocabulary[feature] = idx
                    j_indices.append(idx)
                else:
                    # the feature is already in the trie, so the lookup cannot fail
                    j_indices.append(vocabulary[feature])
            indptr.append(len(j_indices))

        # some Python/Scipy versions won't accept an array.array:
        if j_indices:
            j_indices = np.frombuffer(j_indices, dtype=np.intc)
        else:
            j_indices = np.array([], dtype=np.int32)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.ones(len(j_indices))

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sum_duplicates()
        return vocabulary, X
    def _sort_features(self, X, vocabulary):
        # no-op: alphabetical column sorting is skipped for the trie vocabulary
        return X

    def _limit_features(self, X, vocabulary, high=None, low=None,
                        limit=None):
        # no-op: document-frequency pruning is skipped; nothing is removed
        return X, set()
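
A minimal usage sketch, not part of the original gist: it assumes the hat_trie package is installed and a scikit-learn version from that era (one that still exposes the private _make_int_array helper). The corpus and the comments on the output are illustrative only.

# Hypothetical example: fit the trie-backed vectorizer on a tiny corpus.
docs = [
    "the quick brown fox",
    "the quick red fox",
]

vec = HatTrieCountVectorizer()
X = vec.fit_transform(docs)   # fitting path only; a later transform() call
                              # would pass fixed_vocab=True and raise
                              # NotImplementedError in _count_vocab above

print(X.shape)       # (2, 5): two documents, five distinct tokens
print(X.toarray())   # raw term counts; columns follow first-occurrence order
                     # because _sort_features is a no-op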