@rafbarr
Created October 2, 2016 09:58
Very efficient implementation of feature hashing for Python.
# _feature_hasher.pyx
# distutils: language=c++
import collections
import operator
from functools import reduce

import cython
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_GET_SIZE
from libc.stdint cimport uint8_t, uint32_t
from libcpp.vector cimport vector
from murmurhash cimport mrmr

from .helpers import fast_coo2csr


cdef inline uint32_t _murmur3_32(object key, uint32_t seed=0) except? 0:
    """32-bit MurmurHash3 for bytes and unicode objects."""
    cdef bytes bkey
    if isinstance(key, unicode):
        bkey = (<unicode> key).encode('utf-8')
    elif isinstance(key, bytes):
        bkey = <bytes> key
    else:
        raise ValueError("the key %s must be either unicode or bytes" % key)
    return mrmr.hash32(PyBytes_AS_STRING(bkey), PyBytes_GET_SIZE(bkey), seed)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef uint32_t[:] _murmur3_32_vec(object[:] keys, uint32_t[:] out, int seed=0):
    """Vectorized 32-bit MurmurHash3.

    :param keys: the keys to be hashed
    :param out: a pre-allocated uint32_t[:] array used to store the output
    :param seed: the MurmurHash3 seed
    :return: the out array
    """
    assert len(keys) == len(out)
    for i in range(len(keys)):
        out[i] = _murmur3_32(keys[i], seed)
    return out
cdef inline uint32_t _fnv1a_32(uint8_t* buf, size_t buf_size, uint32_t seed=0x811c9dc5):
    """32-bit FNV-1a hash."""
    cdef uint8_t* buf_curr = buf
    cdef uint8_t* buf_end = buf + buf_size
    while buf_curr < buf_end:
        seed ^= buf_curr[0]
        # multiply by the 32-bit FNV prime (16777619) via shifts and adds
        seed += (seed << 1) + (seed << 4) + (seed << 7) + (seed << 8) + (seed << 24)
        buf_curr += 1
    return seed
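

# A plain-Python sanity sketch (not part of the gist): the shift-add update above is
# multiplication by the 32-bit FNV prime, 16777619 = 2**24 + 2**8 + 2**7 + 2**4 + 2**1 + 2**0.
def _fnv1a_32_reference(data, seed=0x811c9dc5):
    # reference FNV-1a over a byte string, truncated to 32 bits at each step
    for byte in bytearray(data):
        seed ^= byte
        seed = (seed * 16777619) & 0xffffffff
    return seed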
@cython.boundscheck(False)
@cython.wraparound(False)
cdef uint32_t[:] _combine_hash_arrays(list hash_arrays, uint32_t[:] out):
    """Combine hash arrays by taking the FNV-1a hash of their values.

    For instance, for the i-th entry of two arrays a and b, this will produce a final array
    out such that out[i] := fnv1a([a[i], b[i]]).

    :param hash_arrays: a list of uint32_t[:] arrays (of same size) containing hash values
    :param out: a pre-allocated uint32_t[:] array used to store the output
    :return: the out array
    """
    cdef size_t n_hash_arrays = len(hash_arrays)
    assert n_hash_arrays > 1
    cdef size_t n_elements = len(hash_arrays[0])
    assert n_elements == len(out)
    cdef uint32_t[:] hash_array
    cdef vector[uint32_t*] hash_array_ptrs
    for hash_array in hash_arrays:
        assert len(hash_array) == n_elements
        hash_array_ptrs.push_back(&hash_array[0])
    cdef vector[uint32_t] buf = vector[uint32_t](n_hash_arrays)
    cdef size_t n_buf_bytes = sizeof(uint32_t) * n_hash_arrays
    for i in range(n_elements):
        for j in range(n_hash_arrays):
            buf[j] = hash_array_ptrs[j][i]
        out[i] = _fnv1a_32(<uint8_t*> &buf[0], n_buf_bytes)
    return out
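

# Illustration (not part of the gist): on a little-endian machine, the loop above is FNV-1a
# over the concatenated 4-byte values of each row, i.e. for two hash values a_i and b_i the
# combined hash equals
#
#     import struct
#     _fnv1a_32_reference(struct.pack('<II', a_i, b_i))
#
# using the reference implementation sketched after _fnv1a_32 above.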
def transform(X, int n_bits, set cat_feats, set cont_feats, set crossed_feats, dtype=np.float32,
              bint unbiased=True):
    """Feature hashing transformation.

    For details, see http://www.machinelearning.org/archive/icml2009/papers/407.pdf.

    :param X: a pandas DataFrame or a mapping from feature name to an array-like data structure
        containing the data to be transformed
    :param n_bits: the number of bits used to determine the size of the feature space
    :param cat_feats: a set with the names of the categorical features
    :param cont_feats: a set with the names of the continuous features
    :param crossed_feats: a set with tuples indicating the crossed (i.e. interaction) features
    :param dtype: the data type of the output (should be numpy.float32 or numpy.float64)
    :param unbiased: whether the sign of the hashes should be used to counter the bias
        introduced by potential collisions
    :return: a scipy sparse CSR matrix with the result of the feature hashing transformation of X
    """
    assert isinstance(X, (pd.DataFrame, collections.Mapping))
    if isinstance(X, pd.DataFrame):
        n_samples = X.shape[0]
    else:
        n_samples = len(next(iter(X.values())))
        assert all(len(v) == n_samples for v in X.values())
    n_features = len(cat_feats) + len(cont_feats) + len(crossed_feats)
    n_hashed_features = n_samples * n_features
    assert n_hashed_features > 0

    # one COO entry per (sample, feature) pair
    rows = np.tile(np.arange(n_samples, dtype=np.uint32), n_features)
    cols = np.empty(n_hashed_features, dtype=np.uint32)
    vals = np.ones(n_hashed_features, dtype=dtype)

    # give each feature a contiguous, writable view into the cols/vals buffers
    cols_per_feat, vals_per_feat = {}, {}
    for i, f in enumerate(sorted(cat_feats | cont_feats | crossed_feats)):
        cols_per_feat[f] = cols[i * n_samples:i * n_samples + n_samples]
        vals_per_feat[f] = vals[i * n_samples:i * n_samples + n_samples]

    for f in cat_feats:
        # hash each categorical value, seeded with the hash of the feature name so that
        # equal values from different features map to different columns
        _murmur3_32_vec(
            np.array(X[f], copy=False, dtype=object),
            cols_per_feat[f],
            seed=_murmur3_32(f)
        )

    for f in cont_feats:
        # a continuous feature maps to a single hashed column holding its raw values
        cols_per_feat[f][:] = _murmur3_32(f)
        vals_per_feat[f][:] = np.array(X[f], copy=False, dtype=dtype)

    for cf in crossed_feats:
        # avoid expensive rehashing by combining the previously computed hashes
        _combine_hash_arrays([cols_per_feat[f] for f in cf], cols_per_feat[cf])
        # the value of a cross is the product of its continuous components
        # (categorical components contribute 1)
        vals_per_feat[cf][:] = reduce(operator.mul, [X[f] for f in cf if f in cont_feats], 1)

    # multiply the values by the sign of the hash to avoid bias
    if unbiased:
        vals *= np.sign(cols.astype(np.int32, copy=False))

    # map the hashes to the actual bucket indices
    n_hash_buckets = 2 ** n_bits
    cols &= n_hash_buckets - 1

    return fast_coo2csr(coo_matrix(
        (vals, (rows, cols)),
        (n_samples, n_hash_buckets)
    ))
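

# A minimal pure-Python sketch (not part of the gist) of what transform() computes for one
# sample's categorical features; mmh3 is assumed here as a stand-in for mrmr.hash32 and is
# not guaranteed to be bit-identical to it.
def _hash_sample_sketch(sample, cat_feats, n_bits, unbiased=True):
    # sample: dict mapping feature name -> string value
    import mmh3  # assumed stand-in for the murmurhash cimport above
    n_buckets = 2 ** n_bits
    out = [0.0] * n_buckets
    for f in cat_feats:
        # per-feature seed, as in transform(): the hash of the feature name
        h = mmh3.hash(sample[f], seed=mmh3.hash(f, signed=False), signed=False)
        sign = -1.0 if h >> 31 else 1.0  # sign of the hash viewed as an int32
        out[h & (n_buckets - 1)] += sign if unbiased else 1.0
    return out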


# feature_hasher.py (module name assumed; not given in the gist)
import operator
from functools import reduce

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

from ._feature_hasher import transform


class FeatureHasher(BaseEstimator, TransformerMixin):
    """Feature hashing transformer.

    Feature hashing is a technique for controlling the memory footprint of the feature
    engineering module of a machine learning system. The typical approach of one-hot encoding
    categorical variables requires a dictionary mapping strings to indices, which normally
    takes up large amounts of memory.

    Feature hashing, on the other hand, only requires storing the hash function. Each feature
    is passed through the hash function to find the corresponding index in the feature vector,
    which is of predetermined size. Collisions are bound to happen; however, they are not as
    big of a deal as one might think [1].

    References
    ==========
    1. Weinberger, Kilian, et al. "Feature hashing for large scale multitask learning."
       Proceedings of the 26th Annual International Conference on Machine Learning. ACM, 2009.
    2. Wikipedia contributors. "Feature hashing." Wikipedia, The Free Encyclopedia. Wikipedia,
       The Free Encyclopedia, 15 Jan. 2015. Web. 2 Jun. 2015.
    """
    def __init__(self, n_bits=22, cat_feats=None, cont_feats=None, crossed_feats=None,
                 dtype=np.float32, unbiased=True):
        assert 0 < n_bits < 32
        self.n_bits_ = n_bits
        self.cat_feats_ = set(cat_feats or [])
        self.cont_feats_ = set(cont_feats or [])
        self.crossed_feats_ = set(crossed_feats or [])
        self.dtype_ = dtype
        self.unbiased_ = unbiased
        # at least one plain feature, no overlap between the categorical and continuous sets,
        # and every crossed feature built only from declared features
        assert len(self.cat_feats_ | self.cont_feats_) > 0
        assert len(self.cat_feats_ & self.cont_feats_) == 0
        assert reduce(
            operator.or_, map(set, self.crossed_feats_), set()
        ).issubset(self.cat_feats_ | self.cont_feats_)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return transform(
            X, self.n_bits_, self.cat_feats_, self.cont_feats_, self.crossed_feats_,
            self.dtype_, self.unbiased_
        )
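

# A minimal usage sketch (not part of the gist), assuming the Cython extension has been
# compiled; feature names and data are made up for illustration.
if __name__ == '__main__':
    import pandas as pd

    X = pd.DataFrame({
        'city': ['sf', 'nyc', 'sf'],
        'age': [31.0, 25.0, 40.0],
    })
    hasher = FeatureHasher(
        n_bits=18,
        cat_feats=['city'],
        cont_feats=['age'],
        crossed_feats=[('city', 'age')],
    )
    X_csr = hasher.fit(X).transform(X)  # scipy CSR matrix of shape (3, 2 ** 18)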


# helpers.py
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.sputils import get_index_dtype, upcast
from scipy.sparse._sparsetools import coo_tocsr


def fast_coo2csr(X):
    """Fast COO-to-CSR conversion using scipy's low-level coo_tocsr routine directly."""
    assert X.format == 'coo'
    m, n = X.shape
    idx_dtype = get_index_dtype((X.row, X.col), maxval=max(X.nnz, n))
    row = X.row.astype(idx_dtype, copy=False)
    col = X.col.astype(idx_dtype, copy=False)

    # allocate space for the CSR matrix
    indptr = np.empty(m + 1, dtype=idx_dtype)
    indices = np.empty_like(col, dtype=idx_dtype)
    data = np.empty_like(X.data, dtype=upcast(X.dtype))

    # do the conversion
    coo_tocsr(m, n, X.nnz, row, col, X.data, indptr, indices, data)
    X_csr = csr_matrix((data, indices, indptr), shape=X.shape)
    X_csr.sum_duplicates()
    return X_csr
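

# A usage sketch (not part of the gist): round-trip a small COO matrix through fast_coo2csr
# and check it matches the dense result.
if __name__ == '__main__':
    from scipy.sparse import coo_matrix

    coo = coo_matrix(([1.0, 2.0, 3.0], ([0, 1, 1], [2, 0, 2])), shape=(2, 4))
    csr = fast_coo2csr(coo)
    assert (csr.toarray() == coo.toarray()).all()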