Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
An improvement over the CompressedFeatures class introduced at http://derandomized.com/post/51709771229/compressed-features-for-machine-learning#disqus_thread by not requiring the key->component mapping to be stored.
class DeterministicCompressedFeatures(CompressedFeatures):
    """Generates random components after seeding with the component_key.

    By using a known seed to generate the random components, we do not need to
    store or manage them. We can just recompute them whenever we need.

    Components are still cached in ``self.random_components`` for the lifetime
    of the instance so repeated increments do not recompute them, but that
    cache never needs to be persisted: the same key always regenerates the
    same vector.

    NOTE(review): ``num_features``, ``feature_vector``, ``dynamic_mode`` and
    ``contains_component()`` are not defined in this class; they are presumably
    provided by ``CompressedFeatures`` -- confirm against the base class.
    """

    def __init__(self, num_features=RANDOM_FEATURE_LENGTH):
        """Initializes the base class and an empty component cache.

        Arguments:
          num_features - Forwarded unchanged to CompressedFeatures.__init__().
        """
        # Fixed: the original named a non-existent class
        # (DeterministicallyRandomFeatures) here, which raised NameError on
        # construction.
        super(DeterministicCompressedFeatures, self).__init__(num_features)
        # We can't use a defaultdict, because we must pass a parameter to
        # _generate_component()
        self.random_components = {}

    def _generate_component(self, component_key):
        """Returns the unit-length random column vector for component_key.

        Arguments:
          component_key - Any object; its repr() is hashed to derive the seed,
              so keys with equal repr() map to the same vector.

        Returns:
          An array of shape (self.num_features, 1), normalized to unit length.
        """
        # We must use hashlib, because hash() is unreliable
        # http://stackoverflow.com/questions/793761/built-in-python-hash-function
        # hashlib requires bytes on Python 3, so encode the repr first; on
        # Python 2 this leaves the (ASCII) repr, and thus the digest, unchanged.
        key_bytes = repr(component_key).encode("utf-8")
        big_hash = int(hashlib.md5(key_bytes).hexdigest(), 16)
        # Shrink the 39-digit hash to something the seed will accept
        lil_hash = int(big_hash % ((1 << 31) - 1))
        # Use a private generator instead of np.random.seed(): it yields the
        # same stream for the same seed, but does not clobber the global NumPy
        # RNG state that unrelated code may be relying on.
        rng = np.random.RandomState(lil_hash)
        # Deterministically compute the feature vector, based on the key
        rv = rng.randn(self.num_features, 1)
        rv /= np.sqrt(np.dot(rv.T, rv))  # normalize to unit length
        return rv

    def increment_component(self, component_key, scale=1.0):
        """Increments feature_vector by the specified component.

        Arguments:
          component_key - The component to increment by. If this key has
              never been seen before, the component values are generated.
          scale - The multiplicative factor to apply against the
              component values.

        Returns:
          True if feature_vector was incremented; False if the key is unknown
          and dynamic_mode is off, in which case nothing is modified.
        """
        if not self.contains_component(component_key):
            if not self.dynamic_mode:
                return False
            self.random_components[component_key] = (
                self._generate_component(component_key))
        self.feature_vector += scale * self.random_components[component_key]
        return True
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment