Gist kmike/9819115 by @kmike, last active December 21, 2017.
Folder structure should be the following:

vectorizers/
    vec/
    stored/
    string_dict/
        string_dict.pyx
        setup.py
    marisa_vectorizers.py
    memusage_fit.py
    memusage_transform.py
Run setup.py install from the string_dict folder, then run memusage_fit.py with one of the vectorizer names it lists (run it without arguments to see them, e.g. count or marisa_count), and finally run memusage_transform.py with the same name. A condensed sketch of what the two scripts do is shown below.
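The sketch below is not part of the gist; it just condenses the fit/dump/load/transform workflow the two scripts perform. The count vectorizer and the vec/count.joblib path are only examples, and the vec/ directory is assumed to exist.

# Condensed sketch of the benchmark workflow (illustration only, not in the gist):
# memusage_fit.py fits a chosen vectorizer on 20newsgroups-train and dumps it
# with joblib; memusage_transform.py loads it back and transforms 20newsgroups-test.
from sklearn import datasets
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer

train = datasets.fetch_20newsgroups(subset='train')
vec = CountVectorizer()                    # any entry from the ``vectorizers`` dict below
vec.fit(train.data)                        # memusage_fit.py measures time/RSS around this
joblib.dump(vec, 'vec/count.joblib')

test = datasets.fetch_20newsgroups(subset='test')
vec = joblib.load('vec/count.joblib')      # memusage_transform.py measures load cost
X = vec.transform(test.data)               # ... and transform cost
print(X.shape)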
# marisa_vectorizers.py
import string
import numpy as np
import scipy.sparse as sp
import marisa_trie
import hat_trie
# import datrie
# import chartrie

from string_dict import UnicodeIntDict, UnicodeIntDict2
from sklearn.externals import six
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, _make_int_array


# hack to store vocabulary in MARISA Trie
class _MarisaVocabularyMixin(object):

    def fit_transform(self, raw_documents, y=None):
        super(_MarisaVocabularyMixin, self).fit_transform(raw_documents)
        self._freeze_vocabulary()
        return super(_MarisaVocabularyMixin, self).fit_transform(raw_documents, y)

    def _freeze_vocabulary(self):
        if not self.fixed_vocabulary:
            self.vocabulary_ = marisa_trie.Trie(self.vocabulary_.keys())
            self.fixed_vocabulary = True
            del self.stop_words_


class MarisaCountVectorizerOld(_MarisaVocabularyMixin, CountVectorizer):
    pass


class ReducedCountVectorizer(CountVectorizer):

    def _sort_features(self, X, vocabulary):
        return X

    def _limit_features(self, X, vocabulary, high=None, low=None,
                        limit=None):
        return X, set()


class _TrieCountVectorizer(ReducedCountVectorizer):
    trie_cls = None

    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False
        """
        if fixed_vocab:
            raise NotImplementedError()

        vocabulary = self.trie_cls()
        analyze = self.build_analyzer()
        j_indices = _make_int_array()
        indptr = _make_int_array()
        indptr.append(0)
        for doc in raw_documents:
            for feature in analyze(doc):
                if feature not in vocabulary:
                    idx = len(vocabulary)
                    vocabulary[feature] = idx
                    j_indices.append(idx)
                else:
                    try:
                        j_indices.append(vocabulary[feature])
                    except KeyError:
                        # Ignore out-of-vocabulary items for fixed_vocab=True
                        continue
            indptr.append(len(j_indices))

        # some Python/Scipy versions won't accept an array.array:
        if j_indices:
            j_indices = np.frombuffer(j_indices, dtype=np.intc)
        else:
            j_indices = np.array([], dtype=np.int32)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.ones(len(j_indices))

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=self.dtype)
        X.sum_duplicates()
        return vocabulary, X

    def _sort_features(self, X, vocabulary):
        return X

    def _limit_features(self, X, vocabulary, high=None, low=None,
                        limit=None):
        return X, set()


class HatTrieCountVectorizer(_TrieCountVectorizer):
    trie_cls = hat_trie.Trie


# class DatrieCountVectorizer(_TrieCountVectorizer):  # it segfaults
#     trie_cls = lambda *args: datrie.Trie(ranges=[(chr(1), chr(255))])

# class ChartrieCountVectorizer(_TrieCountVectorizer):  # can't get it to work
#     trie_cls = chartrie.CharTrie


class StdCountVectorizer(_TrieCountVectorizer):  # this is not a trie, I know
    trie_cls = UnicodeIntDict2


class MarisaCountVectorizer(CountVectorizer):

    # ``CountVectorizer.fit`` method calls ``fit_transform`` so
    # ``fit`` is not provided
    def fit_transform(self, raw_documents, y=None):
        X = super(MarisaCountVectorizer, self).fit_transform(raw_documents)
        X = self._freeze_vocabulary(X)
        return X

    def _freeze_vocabulary(self, X=None):
        if not self.fixed_vocabulary:
            frozen = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
            if X is not None:
                X = self._reorder_features(X, self.vocabulary_, frozen)
            self.vocabulary_ = frozen
            self.fixed_vocabulary = True
            del self.stop_words_
        return X

    def _reorder_features(self, X, old_vocabulary, new_vocabulary):
        map_index = np.empty(len(old_vocabulary), dtype=np.int32)
        for term, new_val in six.iteritems(new_vocabulary):
            map_index[new_val] = old_vocabulary[term]
        return X[:, map_index]


class MarisaTfidfVectorizer(TfidfVectorizer):

    def fit_transform(self, raw_documents, y=None):
        super(MarisaTfidfVectorizer, self).fit_transform(raw_documents)
        self._freeze_vocabulary()
        return super(MarisaTfidfVectorizer, self).fit_transform(raw_documents, y)

    def fit(self, raw_documents, y=None):
        super(MarisaTfidfVectorizer, self).fit(raw_documents)
        self._freeze_vocabulary()
        return super(MarisaTfidfVectorizer, self).fit(raw_documents, y)

    def _freeze_vocabulary(self, X=None):
        if not self.fixed_vocabulary:
            self.vocabulary_ = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
            self.fixed_vocabulary = True
            del self.stop_words_
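To show what freezing the vocabulary looks like from the caller's side, here is a minimal usage sketch (my addition, not from the gist). It assumes the same scikit-learn version the gist targets, where fixed_vocabulary is a plain writable attribute.

# Minimal usage sketch for MarisaCountVectorizer (illustration only).
# After fit_transform() the vocabulary is frozen into a marisa_trie.Trie,
# which is what saves memory compared to a plain dict vocabulary.
from marisa_vectorizers import MarisaCountVectorizer

docs = [u"the quick brown fox", u"the lazy dog", u"the quick dog"]

vec = MarisaCountVectorizer()
X = vec.fit_transform(docs)
print(type(vec.vocabulary_))                   # -> marisa_trie.Trie
print(X.shape)                                 # (3, number of distinct terms)
print(vec.transform([u"quick fox"]).toarray()) # lookups go through the trie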
#!/usr/bin/env python
# memusage_fit.py
from __future__ import division, print_function
import os
import sys
import time
import resource
import psutil
import gc

from sklearn import datasets
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

from marisa_vectorizers import (
    MarisaCountVectorizer,
    MarisaTfidfVectorizer,
    MarisaCountVectorizerOld,
    HatTrieCountVectorizer,
    # DatrieCountVectorizer,
    # ChartrieCountVectorizer,
    StdCountVectorizer,
    ReducedCountVectorizer,
)

vectorizers = dict(
    count = CountVectorizer(),
    count2 = CountVectorizer(ngram_range=(1,2)),
    count3 = CountVectorizer(ngram_range=(1,3)),
    count4 = CountVectorizer(ngram_range=(1,4)),

    rcount = ReducedCountVectorizer(),
    rcount2 = ReducedCountVectorizer(ngram_range=(1,2)),
    rcount3 = ReducedCountVectorizer(ngram_range=(1,3)),
    rcount4 = ReducedCountVectorizer(ngram_range=(1,4)),

    tfidf = TfidfVectorizer(),
    tfidf2 = TfidfVectorizer(ngram_range=(1,2)),

    hashing18 = HashingVectorizer(n_features=2**18),
    hashing20 = HashingVectorizer(n_features=2**20),

    marisa_count = MarisaCountVectorizer(),
    marisa_count2 = MarisaCountVectorizer(ngram_range=(1,2)),
    marisa_count3 = MarisaCountVectorizer(ngram_range=(1,3)),
    marisa_count4 = MarisaCountVectorizer(ngram_range=(1,4)),

    marisa_count_old = MarisaCountVectorizerOld(),
    marisa_count2_old = MarisaCountVectorizerOld(ngram_range=(1,2)),

    marisa_tfidf = MarisaTfidfVectorizer(),
    marisa_tfidf2 = MarisaTfidfVectorizer(ngram_range=(1,2)),

    hattrie_count = HatTrieCountVectorizer(),
    hattrie_count2 = HatTrieCountVectorizer(ngram_range=(1,2)),
    hattrie_count3 = HatTrieCountVectorizer(ngram_range=(1,3)),
    hattrie_count4 = HatTrieCountVectorizer(ngram_range=(1,4)),

    # datrie_count = DatrieCountVectorizer(),
    # datrie_count2 = DatrieCountVectorizer(ngram_range=(1,2)),

    # chartrie_count = ChartrieCountVectorizer(),
    # chartrie_count2 = ChartrieCountVectorizer(ngram_range=(1,2)),

    std_count = StdCountVectorizer(),
    std_count2 = StdCountVectorizer(ngram_range=(1,2)),
    std_count3 = StdCountVectorizer(ngram_range=(1,3)),
    std_count4 = StdCountVectorizer(ngram_range=(1,4)),
)

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Available vectorizers:\n")
        print("\n".join(sorted(vectorizers.keys())))
        sys.exit()

    vecname = sys.argv[1]
    vec = vectorizers[vecname]

    newsgroups_train = datasets.fetch_20newsgroups(subset='train')

    p = psutil.Process(os.getpid())
    before = p.get_memory_info().rss / 2**20
    max_before = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20

    start = time.time()
    vec.fit(newsgroups_train.data)
    end = time.time()

    after = p.get_memory_info().rss / 2**20
    max_after = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20
    assert max_after >= max_before

    print("fit time: %0.1fs" % (end-start))
    print("fit memory usage: %0.1fMB" % (max_after-before))

    before2 = p.get_memory_info().rss / 2**20
    joblib.dump(vec, os.path.join("vec", vecname+".joblib"))
    max_after2 = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 2**20
    assert max_after2 >= max_after

    print("dump time: %0.1fs" % (time.time()-end))
    print("dump memory usage: %0.1fMB" % (max_after2-before2))
#!/usr/bin/env python
# memusage_transform.py
from __future__ import division, print_function
import os
import sys
import time
import psutil

from sklearn.externals import joblib
from sklearn import datasets

# the following imports are not needed, but if we don't import them
# memory usage numbers will include memory required for loading these modules
import array
import cPickle
from collections import defaultdict
import numpy
import scipy.sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction import DictVectorizer


def _transform(vec, data):
    # XXX: this code leaks memory:
    X = vec.transform(data)

    # XXX: and this code doesn't leak memory - why?
    # for doc in data:
    #     X = vec.transform([doc])

    return X.shape[1]


if __name__ == '__main__':
    vecname = sys.argv[1]
    fname = os.path.join('vec', vecname+'.joblib')

    newsgroups_test = datasets.fetch_20newsgroups(subset='test')

    p = psutil.Process(os.getpid())
    before = p.get_memory_info().rss / 2**20

    start_load = time.time()
    vec = joblib.load(fname)
    end_load = time.time()
    after_load = p.get_memory_info().rss / 2**20

    start_transform = time.time()
    n_features = _transform(vec, newsgroups_test.data)
    end_transform = time.time()
    print("transform features: %d" % n_features)

    after_transform = p.get_memory_info().rss / 2**20

    print("load time: %0.1fs" % (end_load-start_load))
    print("load memory usage: %0.1fMB" % (after_load-before))
    print("transform time: %0.1fs" % (end_transform-start_transform))
    print("transform memory leak: %0.1fMB" % (after_transform-after_load))
    # print("total memory usage: %0.1fMB" % (after_transform-before))
#!/usr/bin/env python
# setup.py
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

setup(
    name='string_dict',
    cmdclass = {'build_ext': build_ext},
    ext_modules = [Extension("string_dict", ["string_dict.pyx"], language='c++')]
)
# string_dict.pyx
from cython.operator cimport dereference as deref
from libcpp.string cimport string as cpp_string
from libcpp.map cimport map as cpp_map
from libcpp.utility cimport pair


cdef extern from "<unordered_map>" namespace "std":
    cdef cppclass unordered_map[T, U]:
        cppclass iterator:
            pair[T,U]& operator*()
            iterator operator++()
            iterator operator--()
            bint operator==(iterator)
            bint operator!=(iterator)
        cppclass reverse_iterator:
            pair[T,U]& operator*()
            iterator operator++()
            iterator operator--()
            bint operator==(reverse_iterator)
            bint operator!=(reverse_iterator)
        #cppclass const_iterator(iterator):
        #    pass
        #cppclass const_reverse_iterator(reverse_iterator):
        #    pass
        unordered_map()
        unordered_map(unordered_map&)
        #unordered_map(key_compare&)
        U& operator[](T&)
        #unordered_map& operator=(unordered_map&)
        bint operator==(unordered_map&, unordered_map&)
        bint operator!=(unordered_map&, unordered_map&)
        bint operator<(unordered_map&, unordered_map&)
        bint operator>(unordered_map&, unordered_map&)
        bint operator<=(unordered_map&, unordered_map&)
        bint operator>=(unordered_map&, unordered_map&)
        U& at(T&)
        iterator begin()
        #const_iterator begin()
        void clear()
        size_t count(T&)
        bint empty()
        iterator end()
        #const_iterator end()
        pair[iterator, iterator] equal_range(T&)
        #pair[const_iterator, const_iterator] equal_range(key_type&)
        void erase(iterator)
        void erase(iterator, iterator)
        size_t erase(T&)
        iterator find(T&)
        #const_iterator find(key_type&)
        pair[iterator, bint] insert(pair[T,U])  # XXX pair[T,U]&
        iterator insert(iterator, pair[T,U])  # XXX pair[T,U]&
        #void insert(input_iterator, input_iterator)
        #key_compare key_comp()
        iterator lower_bound(T&)
        #const_iterator lower_bound(key_type&)
        size_t max_size()
        reverse_iterator rbegin()
        #const_reverse_iterator rbegin()
        reverse_iterator rend()
        #const_reverse_iterator rend()
        size_t size()
        void swap(unordered_map&)
        iterator upper_bound(T&)
        #const_iterator upper_bound(key_type&)
        #value_compare value_comp()


cdef class UnicodeIntDict2:
    cdef unordered_map[cpp_string, int] _map

    def __init__(self):
        self._map = unordered_map[cpp_string, int]()

    def __setitem__(self, unicode key, int value):
        self._map[key.encode('utf8')] = value

    def __getitem__(self, unicode key):
        cdef unordered_map[cpp_string, int].iterator it = self._map.find(key.encode('utf8'))
        if it == self._map.end():
            raise KeyError(key)
        return deref(it).second

    def __contains__(self, unicode key):
        cdef unordered_map[cpp_string, int].iterator it = self._map.find(key.encode('utf8'))
        return not it == self._map.end()

    def __len__(self):
        return self._map.size()


cdef class UnicodeIntDict:
    cdef cpp_map[cpp_string, int] _map

    def __init__(self):
        self._map = cpp_map[cpp_string, int]()

    def __setitem__(self, unicode key, int value):
        self._map[key.encode('utf8')] = value

    def __getitem__(self, unicode key):
        cdef cpp_map[cpp_string, int].iterator it = self._map.find(key.encode('utf8'))
        if it == self._map.end():
            raise KeyError(key)
        return deref(it).second

    def __contains__(self, unicode key):
        cdef cpp_map[cpp_string, int].iterator it = self._map.find(key.encode('utf8'))
        return not it == self._map.end()

    def __len__(self):
        return self._map.size()
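Finally, a small usage sketch for the compiled string_dict extension (my addition, not from the gist); it assumes the module has been built and installed as described at the top.

# Usage sketch for the string_dict extension (illustration only):
# UnicodeIntDict2 (std::unordered_map) and UnicodeIntDict (std::map) both act
# as minimal unicode -> int mappings, which is all _TrieCountVectorizer needs.
from string_dict import UnicodeIntDict, UnicodeIntDict2

d = UnicodeIntDict2()
d[u"fox"] = 0
d[u"dog"] = 1
print(d[u"fox"])         # 0
print(u"dog" in d)       # True
print(len(d))            # 2
try:
    d[u"cat"]
except KeyError:
    print("missing keys raise KeyError")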