@rafbarr
Created October 2, 2016 09:58
Very efficient implementation of feature hashing for Python.
# _feature_hasher.pyx
# distutils: language=c++
import collections
import operator
from functools import reduce

import cython
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_GET_SIZE
from libc.stdint cimport uint8_t, uint32_t
from libcpp.vector cimport vector
from murmurhash cimport mrmr

from .helpers import fast_coo2csr


cdef inline uint32_t _murmur3_32(object key, uint32_t seed=0) except? 0:
    """32-bit MurmurHash3 for bytes and unicode objects."""
    cdef bytes bkey
    if isinstance(key, unicode):
        bkey = (<unicode> key).encode('utf-8')
    elif isinstance(key, bytes):
        bkey = <bytes> key
    else:
        raise ValueError("the key %s must be either unicode or bytes" % key)
    return mrmr.hash32(PyBytes_AS_STRING(bkey), PyBytes_GET_SIZE(bkey), seed)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef uint32_t[:] _murmur3_32_vec(object[:] keys, uint32_t[:] out, int seed=0):
    """Vectorized 32-bit MurmurHash3.

    :param keys: the keys to be hashed
    :param out: a pre-allocated uint32_t[:] array used to store the output
    :param seed: the MurmurHash3 seed
    :return: the out array
    """
    assert len(keys) == len(out)
    for i in range(len(keys)):
        out[i] = _murmur3_32(keys[i], seed)
    return out
cdef inline uint32_t _fnv1a_32(uint8_t* buf, size_t buf_size, uint32_t seed=0x811c9dc5):
    """32-bit FNV-1a hash."""
    cdef uint8_t* buf_curr = buf
    cdef uint8_t* buf_end = buf + buf_size
    while buf_curr < buf_end:
        seed ^= buf_curr[0]
        # multiply by the 32-bit FNV prime (16777619) via shifts and adds
        seed += (seed << 1) + (seed << 4) + (seed << 7) + (seed << 8) + (seed << 24)
        buf_curr += 1
    return seed
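

# A plain-Python sanity sketch (not part of the gist): the shift-add update above is
# multiplication by the 32-bit FNV prime, 16777619 = 2**24 + 2**8 + 2**7 + 2**4 + 2**1 + 2**0.
def _fnv1a_32_reference(data, seed=0x811c9dc5):
    # reference FNV-1a over a byte string, truncated to 32 bits at each step
    for byte in bytearray(data):
        seed ^= byte
        seed = (seed * 16777619) & 0xffffffff
    return seed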
@cython.boundscheck(False)
@cython.wraparound(False)
cdef uint32_t[:] _combine_hash_arrays(list hash_arrays, uint32_t[:] out):
    """Combine hash arrays by taking the FNV-1a hash of their values.

    For instance, for the i-th entry of two arrays a and b, this will produce a final array
    out such that out[i] := fnv1a([a[i], b[i]]).

    :param hash_arrays: a list of uint32_t[:] arrays (of same size) containing hash values
    :param out: a pre-allocated uint32_t[:] array used to store the output
    :return: the out array
    """
    cdef size_t n_hash_arrays = len(hash_arrays)
    assert n_hash_arrays > 1
    cdef size_t n_elements = len(hash_arrays[0])
    assert n_elements == len(out)
    cdef uint32_t[:] hash_array
    cdef vector[uint32_t*] hash_array_ptrs
    for hash_array in hash_arrays:
        assert len(hash_array) == n_elements
        hash_array_ptrs.push_back(&hash_array[0])
    cdef vector[uint32_t] buf = vector[uint32_t](n_hash_arrays)
    cdef size_t n_buf_bytes = sizeof(uint32_t) * n_hash_arrays
    for i in range(n_elements):
        for j in range(n_hash_arrays):
            buf[j] = hash_array_ptrs[j][i]
        out[i] = _fnv1a_32(<uint8_t*> &buf[0], n_buf_bytes)
    return out
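

# Illustration (not part of the gist): on a little-endian machine, the loop above is FNV-1a
# over the concatenated 4-byte values of each row, i.e. for two hash values a_i and b_i the
# combined hash equals
#
#     import struct
#     _fnv1a_32_reference(struct.pack('<II', a_i, b_i))
#
# using the reference implementation sketched after _fnv1a_32 above.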
def transform(X, int n_bits, set cat_feats, set cont_feats, set crossed_feats, dtype=np.float32,
              bint unbiased=True):
    """Feature hashing transformation.

    For details, see http://www.machinelearning.org/archive/icml2009/papers/407.pdf.

    :param X: a pandas DataFrame or a mapping from feature name to an array-like data structure
        containing the data to be transformed
    :param n_bits: the number of bits used to determine the size of the feature space
    :param cat_feats: a set with the names of the categorical features
    :param cont_feats: a set with the names of the continuous features
    :param crossed_feats: a set with tuples indicating the crossed (i.e. interaction) features
    :param dtype: the data type of the output (should be numpy.float32 or numpy.float64)
    :param unbiased: whether the sign of the hashes should be used to counter the bias
        introduced by potential collisions
    :return: a scipy sparse CSR matrix with the result of the feature hashing transformation of X
    """
    assert isinstance(X, (pd.DataFrame, collections.Mapping))
    if isinstance(X, pd.DataFrame):
        n_samples = X.shape[0]
    else:
        n_samples = len(next(iter(X.values())))
        assert all(len(v) == n_samples for v in X.values())
    n_features = len(cat_feats) + len(cont_feats) + len(crossed_feats)
    n_hashed_features = n_samples * n_features
    assert n_hashed_features > 0

    # one COO entry per (sample, feature) pair
    rows = np.tile(np.arange(n_samples, dtype=np.uint32), n_features)
    cols = np.empty(n_hashed_features, dtype=np.uint32)
    vals = np.ones(n_hashed_features, dtype=dtype)

    # give each feature a contiguous, writable view into the cols/vals buffers
    cols_per_feat, vals_per_feat = {}, {}
    for i, f in enumerate(sorted(cat_feats | cont_feats | crossed_feats)):
        cols_per_feat[f] = cols[i * n_samples:i * n_samples + n_samples]
        vals_per_feat[f] = vals[i * n_samples:i * n_samples + n_samples]

    for f in cat_feats:
        # hash each categorical value, seeded with the hash of the feature name so that
        # equal values from different features map to different columns
        _murmur3_32_vec(
            np.array(X[f], copy=False, dtype=object),
            cols_per_feat[f],
            seed=_murmur3_32(f)
        )

    for f in cont_feats:
        # a continuous feature maps to a single hashed column holding its raw values
        cols_per_feat[f][:] = _murmur3_32(f)
        vals_per_feat[f][:] = np.array(X[f], copy=False, dtype=dtype)

    for cf in crossed_feats:
        # avoid expensive rehashing by combining the previously computed hashes
        _combine_hash_arrays([cols_per_feat[f] for f in cf], cols_per_feat[cf])
        # the value of a cross is the product of its continuous components
        # (categorical components contribute 1)
        vals_per_feat[cf][:] = reduce(operator.mul, [X[f] for f in cf if f in cont_feats], 1)

    # multiply the values by the sign of the hash to avoid bias
    if unbiased:
        vals *= np.sign(cols.astype(np.int32, copy=False))

    # map the hashes to the actual bucket indices
    n_hash_buckets = 2 ** n_bits
    cols &= n_hash_buckets - 1

    return fast_coo2csr(coo_matrix(
        (vals, (rows, cols)),
        (n_samples, n_hash_buckets)
    ))
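

# A minimal pure-Python sketch (not part of the gist) of what transform() computes for one
# sample's categorical features; mmh3 is assumed here as a stand-in for mrmr.hash32 and is
# not guaranteed to be bit-identical to it.
def _hash_sample_sketch(sample, cat_feats, n_bits, unbiased=True):
    # sample: dict mapping feature name -> string value
    import mmh3  # assumed stand-in for the murmurhash cimport above
    n_buckets = 2 ** n_bits
    out = [0.0] * n_buckets
    for f in cat_feats:
        # per-feature seed, as in transform(): the hash of the feature name
        h = mmh3.hash(sample[f], seed=mmh3.hash(f, signed=False), signed=False)
        sign = -1.0 if h >> 31 else 1.0  # sign of the hash viewed as an int32
        out[h & (n_buckets - 1)] += sign if unbiased else 1.0
    return out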


# feature_hasher.py (module name assumed; not given in the gist)
import operator
from functools import reduce

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

from ._feature_hasher import transform


class FeatureHasher(BaseEstimator, TransformerMixin):
    """Feature hashing transformer.

    Feature hashing is a technique for controlling the memory footprint of the feature
    engineering module of a machine learning system. The typical approach of one-hot encoding
    categorical variables requires a dictionary mapping strings to indices, which normally
    takes up large amounts of memory.

    Feature hashing, on the other hand, only requires storing the hash function. Each feature
    is passed through the hash function to find the corresponding index in the feature vector,
    which is of predetermined size. Collisions are bound to happen; however, they are not as
    big of a deal as one might think [1].

    References
    ==========
    1. Weinberger, Kilian, et al. "Feature hashing for large scale multitask learning."
       Proceedings of the 26th Annual International Conference on Machine Learning. ACM, 2009.
    2. Wikipedia contributors. "Feature hashing." Wikipedia, The Free Encyclopedia. Wikipedia,
       The Free Encyclopedia, 15 Jan. 2015. Web. 2 Jun. 2015.
    """
    def __init__(self, n_bits=22, cat_feats=None, cont_feats=None, crossed_feats=None,
                 dtype=np.float32, unbiased=True):
        assert 0 < n_bits < 32
        self.n_bits_ = n_bits
        self.cat_feats_ = set(cat_feats or [])
        self.cont_feats_ = set(cont_feats or [])
        self.crossed_feats_ = set(crossed_feats or [])
        self.dtype_ = dtype
        self.unbiased_ = unbiased
        # at least one plain feature, no overlap between the categorical and continuous sets,
        # and every crossed feature built only from declared features
        assert len(self.cat_feats_ | self.cont_feats_) > 0
        assert len(self.cat_feats_ & self.cont_feats_) == 0
        assert reduce(
            operator.or_, map(set, self.crossed_feats_), set()
        ).issubset(self.cat_feats_ | self.cont_feats_)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return transform(
            X, self.n_bits_, self.cat_feats_, self.cont_feats_, self.crossed_feats_,
            self.dtype_, self.unbiased_
        )
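

# A minimal usage sketch (not part of the gist), assuming the Cython extension has been
# compiled; feature names and data are made up for illustration.
if __name__ == '__main__':
    import pandas as pd

    X = pd.DataFrame({
        'city': ['sf', 'nyc', 'sf'],
        'age': [31.0, 25.0, 40.0],
    })
    hasher = FeatureHasher(
        n_bits=18,
        cat_feats=['city'],
        cont_feats=['age'],
        crossed_feats=[('city', 'age')],
    )
    X_csr = hasher.fit(X).transform(X)  # scipy CSR matrix of shape (3, 2 ** 18)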


# helpers.py
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.sputils import get_index_dtype, upcast
from scipy.sparse._sparsetools import coo_tocsr


def fast_coo2csr(X):
    """Fast COO-to-CSR conversion using scipy's low-level coo_tocsr routine directly."""
    assert X.format == 'coo'
    m, n = X.shape
    idx_dtype = get_index_dtype((X.row, X.col), maxval=max(X.nnz, n))
    row = X.row.astype(idx_dtype, copy=False)
    col = X.col.astype(idx_dtype, copy=False)

    # allocate space for the CSR matrix
    indptr = np.empty(m + 1, dtype=idx_dtype)
    indices = np.empty_like(col, dtype=idx_dtype)
    data = np.empty_like(X.data, dtype=upcast(X.dtype))

    # do the conversion
    coo_tocsr(m, n, X.nnz, row, col, X.data, indptr, indices, data)
    X_csr = csr_matrix((data, indices, indptr), shape=X.shape)
    X_csr.sum_duplicates()
    return X_csr
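

# A usage sketch (not part of the gist): round-trip a small COO matrix through fast_coo2csr
# and check it matches the dense result.
if __name__ == '__main__':
    from scipy.sparse import coo_matrix

    coo = coo_matrix(([1.0, 2.0, 3.0], ([0, 1, 1], [2, 0, 2])), shape=(2, 4))
    csr = fast_coo2csr(coo)
    assert (csr.toarray() == coo.toarray()).all()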