@kmike
Created March 27, 2014 11:21

import numpy as np
import scipy.sparse as sp
import hat_trie

from sklearn.feature_extraction.text import CountVectorizer, _make_int_array


class HatTrieCountVectorizer(CountVectorizer):
    """CountVectorizer that keeps its vocabulary in a hat_trie.Trie instead of
    a plain dict; only the fixed_vocab=False (fitting) path is implemented."""
    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False
        """
        if fixed_vocab:
            raise NotImplementedError()

        vocabulary = hat_trie.Trie()
        analyze = self.build_analyzer()
        j_indices = _make_int_array()
        indptr = _make_int_array()
        indptr.append(0)
        for doc in raw_documents:
            for feature in analyze(doc):
                if feature not in vocabulary:
                    # new term: assign the next free column index
                    idx = len(vocabulary)
                    vocabulary[feature] = idx
                    j_indices.append(idx)
                else:
                    # the feature is already in the trie, so the lookup cannot fail
                    j_indices.append(vocabulary[feature])
            indptr.append(len(j_indices))

        # some Python/Scipy versions won't accept an array.array:
        if j_indices:
            j_indices = np.frombuffer(j_indices, dtype=np.intc)
        else:
            j_indices = np.array([], dtype=np.int32)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.ones(len(j_indices))

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sum_duplicates()
        return vocabulary, X
    def _sort_features(self, X, vocabulary):
        # no-op: alphabetical column sorting is skipped for the trie vocabulary
        return X

    def _limit_features(self, X, vocabulary, high=None, low=None,
                        limit=None):
        # no-op: document-frequency pruning is skipped; nothing is removed
        return X, set()
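
A minimal usage sketch, not part of the original gist: it assumes the hat_trie package is installed and a scikit-learn version from that era (one that still exposes the private _make_int_array helper). The corpus and the comments on the output are illustrative only.

# Hypothetical example: fit the trie-backed vectorizer on a tiny corpus.
docs = [
    "the quick brown fox",
    "the quick red fox",
]

vec = HatTrieCountVectorizer()
X = vec.fit_transform(docs)   # fitting path only; a later transform() call
                              # would pass fixed_vocab=True and raise
                              # NotImplementedError in _count_vocab above

print(X.shape)       # (2, 5): two documents, five distinct tokens
print(X.toarray())   # raw term counts; columns follow first-occurrence order
                     # because _sort_features is a no-op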