Gist kmike/9819115 by @kmike, last active December 21, 2017.
Folder structure should be the following:

vectorizers/
    vec/
    stored/
    string_dict/
        string_dict.pyx
        setup.py
    marisa_vectorizers.py
    memusage_fit.py
    memusage_transform.py
Run setup.py install from the string_dict folder, then run memusage_fit.py with one of the vectorizer names it lists (run it without arguments to see them, e.g. count or marisa_count), and finally run memusage_transform.py with the same name. A condensed sketch of what the two scripts do is shown below.
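The sketch below is not part of the gist; it just condenses the fit/dump/load/transform workflow the two scripts perform. The count vectorizer and the vec/count.joblib path are only examples, and the vec/ directory is assumed to exist.

# Condensed sketch of the benchmark workflow (illustration only, not in the gist):
# memusage_fit.py fits a chosen vectorizer on 20newsgroups-train and dumps it
# with joblib; memusage_transform.py loads it back and transforms 20newsgroups-test.
from sklearn import datasets
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer

train = datasets.fetch_20newsgroups(subset='train')
vec = CountVectorizer()                    # any entry from the ``vectorizers`` dict below
vec.fit(train.data)                        # memusage_fit.py measures time/RSS around this
joblib.dump(vec, 'vec/count.joblib')

test = datasets.fetch_20newsgroups(subset='test')
vec = joblib.load('vec/count.joblib')      # memusage_transform.py measures load cost
X = vec.transform(test.data)               # ... and transform cost
print(X.shape)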
# marisa_vectorizers.py
import string
import numpy as np
import scipy.sparse as sp
import marisa_trie
import hat_trie
# import datrie
# import chartrie

from string_dict import UnicodeIntDict, UnicodeIntDict2
from sklearn.externals import six
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, _make_int_array


# hack to store vocabulary in MARISA Trie
class _MarisaVocabularyMixin(object):

    def fit_transform(self, raw_documents, y=None):
        super(_MarisaVocabularyMixin, self).fit_transform(raw_documents)
        self._freeze_vocabulary()
        return super(_MarisaVocabularyMixin, self).fit_transform(raw_documents, y)

    def _freeze_vocabulary(self):
        if not self.fixed_vocabulary:
            self.vocabulary_ = marisa_trie.Trie(self.vocabulary_.keys())
            self.fixed_vocabulary = True
            del self.stop_words_


class MarisaCountVectorizerOld(_MarisaVocabularyMixin, CountVectorizer):
    pass


class ReducedCountVectorizer(CountVectorizer):

    def _sort_features(self, X, vocabulary):
        return X

    def _limit_features(self, X, vocabulary, high=None, low=None,
                        limit=None):
        return X, set()


class _TrieCountVectorizer(ReducedCountVectorizer):
    trie_cls = None

    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False
        """
        if fixed_vocab:
            raise NotImplementedError()

        vocabulary = self.trie_cls()
        analyze = self.build_analyzer()
        j_indices = _make_int_array()
        indptr = _make_int_array()
        indptr.append(0)
        for doc in raw_documents:
            for feature in analyze(doc):
                if feature not in vocabulary:
                    idx = len(vocabulary)
                    vocabulary[feature] = idx
                    j_indices.append(idx)
                else:
                    try:
                        j_indices.append(vocabulary[feature])
                    except KeyError:
                        # Ignore out-of-vocabulary items for fixed_vocab=True
                        continue
            indptr.append(len(j_indices))

        # some Python/Scipy versions won't accept an array.array:
        if j_indices:
            j_indices = np.frombuffer(j_indices, dtype=np.intc)
        else:
            j_indices = np.array([], dtype=np.int32)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.ones(len(j_indices))

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sum_duplicates()
        return vocabulary, X

    def _sort_features(self, X, vocabulary):
        return X

    def _limit_features(self, X, vocabulary, high=None, low=None,
                        limit=None):
        return X, set()


class HatTrieCountVectorizer(_TrieCountVectorizer):
    trie_cls = hat_trie.Trie


# class DatrieCountVectorizer(_TrieCountVectorizer):  # it segfaults
#     trie_cls = lambda *args: datrie.Trie(ranges=[(chr(1), chr(255))])

# class ChartrieCountVectorizer(_TrieCountVectorizer):  # can't get it to work
#     trie_cls = chartrie.CharTrie


class StdCountVectorizer(_TrieCountVectorizer):  # this is not a trie, I know
    trie_cls = UnicodeIntDict2


class MarisaCountVectorizer(CountVectorizer):

    # ``CountVectorizer.fit`` method calls ``fit_transform`` so
    # ``fit`` is not provided
    def fit_transform(self, raw_documents, y=None):
        X = super(MarisaCountVectorizer, self).fit_transform(raw_documents)
        X = self._freeze_vocabulary(X)
        return X

    def _freeze_vocabulary(self, X=None):
        if not self.fixed_vocabulary:
            frozen = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
            if X is not None:
                X = self._reorder_features(X, self.vocabulary_, frozen)
            self.vocabulary_ = frozen
            self.fixed_vocabulary = True
            del self.stop_words_
        return X

    def _reorder_features(self, X, old_vocabulary, new_vocabulary):
        map_index = np.empty(len(old_vocabulary), dtype=np.int32)
        for term, new_val in six.iteritems(new_vocabulary):
            map_index[new_val] = old_vocabulary[term]
        return X[:, map_index]


class MarisaTfidfVectorizer(TfidfVectorizer):

    def fit_transform(self, raw_documents, y=None):
        super(MarisaTfidfVectorizer, self).fit_transform(raw_documents)
        self._freeze_vocabulary()
        return super(MarisaTfidfVectorizer, self).fit_transform(raw_documents, y)

    def fit(self, raw_documents, y=None):
        super(MarisaTfidfVectorizer, self).fit(raw_documents)
        self._freeze_vocabulary()
        return super(MarisaTfidfVectorizer, self).fit(raw_documents, y)

    def _freeze_vocabulary(self, X=None):
        if not self.fixed_vocabulary:
            self.vocabulary_ = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
            self.fixed_vocabulary = True
            del self.stop_words_
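To show what freezing the vocabulary looks like from the caller's side, here is a minimal usage sketch (my addition, not from the gist). It assumes the same scikit-learn version the gist targets, where fixed_vocabulary is a plain writable attribute.

# Minimal usage sketch for MarisaCountVectorizer (illustration only).
# After fit_transform() the vocabulary is frozen into a marisa_trie.Trie,
# which is what saves memory compared to a plain dict vocabulary.
from marisa_vectorizers import MarisaCountVectorizer

docs = [u"the quick brown fox", u"the lazy dog", u"the quick dog"]

vec = MarisaCountVectorizer()
X = vec.fit_transform(docs)
print(type(vec.vocabulary_))                   # -> marisa_trie.Trie
print(X.shape)                                 # (3, number of distinct terms)
print(vec.transform([u"quick fox"]).toarray()) # lookups go through the trie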
#!/usr/bin/env python
# memusage_fit.py
from __future__ import division, print_function
import os
import sys
import time
import resource
import psutil
import gc

from sklearn import datasets
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

from marisa_vectorizers import (
    MarisaCountVectorizer,
    MarisaTfidfVectorizer,
    MarisaCountVectorizerOld,
    HatTrieCountVectorizer,
    # DatrieCountVectorizer,
    # ChartrieCountVectorizer,
    StdCountVectorizer,
    ReducedCountVectorizer,
)

vectorizers = dict(
    count = CountVectorizer(),
    count2 = CountVectorizer(ngram_range=(1,2)),
    count3 = CountVectorizer(ngram_range=(1,3)),
    count4 = CountVectorizer(ngram_range=(1,4)),

    rcount = ReducedCountVectorizer(),
    rcount2 = ReducedCountVectorizer(ngram_range=(1,2)),
    rcount3 = ReducedCountVectorizer(ngram_range=(1,3)),
    rcount4 = ReducedCountVectorizer(ngram_range=(1,4)),

    tfidf = TfidfVectorizer(),
    tfidf2 = TfidfVectorizer(ngram_range=(1,2)),

    hashing18 = HashingVectorizer(n_features=2**18),
    hashing20 = HashingVectorizer(n_features=2**20),

    marisa_count = MarisaCountVectorizer(),
    marisa_count2 = MarisaCountVectorizer(ngram_range=(1,2)),
    marisa_count3 = MarisaCountVectorizer(ngram_range=(1,3)),
    marisa_count4 = MarisaCountVectorizer(ngram_range=(1,4)),

    marisa_count_old = MarisaCountVectorizerOld(),
    marisa_count2_old = MarisaCountVectorizerOld(ngram_range=(1,2)),

    marisa_tfidf = MarisaTfidfVectorizer(),
    marisa_tfidf2 = MarisaTfidfVectorizer(ngram_range=(1,2)),

    hattrie_count = HatTrieCountVectorizer(),
    hattrie_count2 = HatTrieCountVectorizer(ngram_range=(1,2)),
    hattrie_count3 = HatTrieCountVectorizer(ngram_range=(1,3)),
    hattrie_count4 = HatTrieCountVectorizer(ngram_range=(1,4)),

    # datrie_count = DatrieCountVectorizer(),
    # datrie_count2 = DatrieCountVectorizer(ngram_range=(1,2)),

    # chartrie_count = ChartrieCountVectorizer(),
    # chartrie_count2 = ChartrieCountVectorizer(ngram_range=(1,2)),

    std_count = StdCountVectorizer(),
    std_count2 = StdCountVectorizer(ngram_range=(1,2)),
    std_count3 = StdCountVectorizer(ngram_range=(1,3)),
    std_count4 = StdCountVectorizer(ngram_range=(1,4)),
)

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Available vectorizers:\n")
        print("\n".join(sorted(vectorizers.keys())))
        sys.exit()

    vecname = sys.argv[1]
    vec = vectorizers[vecname]

    newsgroups_train = datasets.fetch_20newsgroups(subset='train')

    p = psutil.Process(os.getpid())
    before = p.get_memory_info().rss / 2**20
    max_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20

    start = time.time()
    vec.fit(newsgroups_train.data)
    end = time.time()

    after = p.get_memory_info().rss / 2**20
    max_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20
    assert max_after >= max_before

    print("fit time: %0.1fs" % (end-start))
    print("fit memory usage: %0.1fMB" % (max_after-before))

    before2 = p.get_memory_info().rss / 2**20
    joblib.dump(vec, os.path.join("vec", vecname+".joblib"))
    max_after2 = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20
    assert max_after2 >= max_after

    print("dump time: %0.1fs" % (time.time()-end))
    print("dump memory usage: %0.1fMB" % (max_after2-before2))
#!/usr/bin/env python
# memusage_transform.py
from __future__ import division, print_function
import os
import sys
import time
import psutil

from sklearn.externals import joblib
from sklearn import datasets

# the following imports are not needed, but if we don't import them
# memory usage numbers will include memory required for loading these modules
import array
import cPickle
from collections import defaultdict
import numpy
import scipy.sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction import DictVectorizer


def _transform(vec, data):
    # XXX: this code leaks memory:
    X = vec.transform(data)

    # XXX: and this code doesn't leak memory - why?
    # for doc in data:
    #     X = vec.transform([doc])

    return X.shape[1]


if __name__ == '__main__':
    vecname = sys.argv[1]
    fname = os.path.join('vec', vecname+'.joblib')

    newsgroups_test = datasets.fetch_20newsgroups(subset='test')

    p = psutil.Process(os.getpid())
    before = p.get_memory_info().rss / 2**20

    start_load = time.time()
    vec = joblib.load(fname)
    end_load = time.time()
    after_load = p.get_memory_info().rss / 2**20

    start_transform = time.time()
    n_features = _transform(vec, newsgroups_test.data)
    end_transform = time.time()
    print("transform features: %d" % n_features)

    after_transform = p.get_memory_info().rss / 2**20

    print("load time: %0.1fs" % (end_load-start_load))
    print("load memory usage: %0.1fMB" % (after_load-before))
    print("transform time: %0.1fs" % (end_transform-start_transform))
    print("transform memory leak: %0.1fMB" % (after_transform-after_load))
    # print("total memory usage: %0.1fMB" % (after_transform-before))
#!/usr/bin/env python
# setup.py
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

setup(
    name='string_dict',
    cmdclass = {'build_ext': build_ext},
    ext_modules = [Extension("string_dict", ["string_dict.pyx"], language='c++')]
)
# string_dict.pyx
from cython.operator cimport dereference as deref
from libcpp.string cimport string as cpp_string
from libcpp.map cimport map as cpp_map
from libcpp.utility cimport pair


cdef extern from "<unordered_map>" namespace "std":
    cdef cppclass unordered_map[T, U]:
        cppclass iterator:
            pair[T,U]& operator*()
            iterator operator++()
            iterator operator--()
            bint operator==(iterator)
            bint operator!=(iterator)
        cppclass reverse_iterator:
            pair[T,U]& operator*()
            iterator operator++()
            iterator operator--()
            bint operator==(reverse_iterator)
            bint operator!=(reverse_iterator)
        #cppclass const_iterator(iterator):
        #    pass
        #cppclass const_reverse_iterator(reverse_iterator):
        #    pass
        unordered_map()
        unordered_map(unordered_map&)
        #unordered_map(key_compare&)
        U& operator[](T&)
        #unordered_map& operator=(unordered_map&)
        bint operator==(unordered_map&, unordered_map&)
        bint operator!=(unordered_map&, unordered_map&)
        bint operator<(unordered_map&, unordered_map&)
        bint operator>(unordered_map&, unordered_map&)
        bint operator<=(unordered_map&, unordered_map&)
        bint operator>=(unordered_map&, unordered_map&)
        U& at(T&)
        iterator begin()
        #const_iterator begin()
        void clear()
        size_t count(T&)
        bint empty()
        iterator end()
        #const_iterator end()
        pair[iterator, iterator] equal_range(T&)
        #pair[const_iterator, const_iterator] equal_range(key_type&)
        void erase(iterator)
        void erase(iterator, iterator)
        size_t erase(T&)
        iterator find(T&)
        #const_iterator find(key_type&)
        pair[iterator, bint] insert(pair[T,U])  # XXX pair[T,U]&
        iterator insert(iterator, pair[T,U])  # XXX pair[T,U]&
        #void insert(input_iterator, input_iterator)
        #key_compare key_comp()
        iterator lower_bound(T&)
        #const_iterator lower_bound(key_type&)
        size_t max_size()
        reverse_iterator rbegin()
        #const_reverse_iterator rbegin()
        reverse_iterator rend()
        #const_reverse_iterator rend()
        size_t size()
        void swap(unordered_map&)
        iterator upper_bound(T&)
        #const_iterator upper_bound(key_type&)
        #value_compare value_comp()


cdef class UnicodeIntDict2:
    cdef unordered_map[cpp_string, int] _map

    def __init__(self):
        self._map = unordered_map[cpp_string, int]()

    def __setitem__(self, unicode key, int value):
        self._map[key.encode('utf8')] = value

    def __getitem__(self, unicode key):
        cdef unordered_map[cpp_string, int].iterator it = self._map.find(key.encode('utf8'))
        if it == self._map.end():
            raise KeyError(key)
        return deref(it).second

    def __contains__(self, unicode key):
        cdef unordered_map[cpp_string, int].iterator it = self._map.find(key.encode('utf8'))
        return not it == self._map.end()

    def __len__(self):
        return self._map.size()


cdef class UnicodeIntDict:
    cdef cpp_map[cpp_string, int] _map

    def __init__(self):
        self._map = cpp_map[cpp_string, int]()

    def __setitem__(self, unicode key, int value):
        self._map[key.encode('utf8')] = value

    def __getitem__(self, unicode key):
        cdef cpp_map[cpp_string, int].iterator it = self._map.find(key.encode('utf8'))
        if it == self._map.end():
            raise KeyError(key)
        return deref(it).second

    def __contains__(self, unicode key):
        cdef cpp_map[cpp_string, int].iterator it = self._map.find(key.encode('utf8'))
        return not it == self._map.end()

    def __len__(self):
        return self._map.size()
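Finally, a small usage sketch for the compiled string_dict extension (my addition, not from the gist); it assumes the module has been built and installed as described at the top.

# Usage sketch for the string_dict extension (illustration only):
# UnicodeIntDict2 (std::unordered_map) and UnicodeIntDict (std::map) both act
# as minimal unicode -> int mappings, which is all _TrieCountVectorizer needs.
from string_dict import UnicodeIntDict, UnicodeIntDict2

d = UnicodeIntDict2()
d[u"fox"] = 0
d[u"dog"] = 1
print(d[u"fox"])         # 0
print(u"dog" in d)       # True
print(len(d))            # 2
try:
    d[u"cat"]
except KeyError:
    print("missing keys raise KeyError")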