Skip to content

Instantly share code, notes, and snippets.

@sergeyf
Last active April 4, 2016 22:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sergeyf/c66d592d24a7ee5c125b to your computer and use it in GitHub Desktop.
Save sergeyf/c66d592d24a7ee5c125b to your computer and use it in GitHub Desktop.
import numpy as np
from scipy.spatial.distance import pdist, squareform
# function that converts categorical variable
# into a one-hot encoding
def one_hot_encoding(x):
    """Convert a 1-D sequence of integer category labels to a one-hot matrix.

    Columns span the contiguous range ``min(x) .. max(x)``; the column for a
    label ``v`` is ``v - min(x)``, so labels that never occur in ``x`` but lie
    inside that range still get an (all-zero) column.

    Parameters
    ----------
    x : sequence or 1-D array of int
        Category label for each sample.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(x), max(x) - min(x) + 1)`` with exactly
        one ``1`` per row. An empty input yields an array of shape ``(0, 0)``.
    """
    labels = np.asarray(x)
    n = len(labels)
    if n == 0:
        # np.min / np.max raise on empty input; there is nothing to encode.
        return np.zeros((0, 0))

    min_category = labels.min()
    num_categories = labels.max() - min_category + 1

    output = np.zeros((n, num_categories))
    # Row i gets a 1 in the column of its (shifted) label, all rows at once.
    output[np.arange(n), labels - min_category] = 1
    return output
# function that converts categorical variable
# into a random gaussian encoding of length ~log(num_categories)
def gaussian_encoding(x, gaussian_encodings=None):
    """Encode each category label as a fixed random Gaussian vector.

    Every distinct label is assigned one vector drawn from a standard normal
    distribution; all samples with that label share the same vector.

    Parameters
    ----------
    x : sequence or 1-D array of int
        Category label for each sample.
    gaussian_encodings : dict, optional
        Existing ``label -> vector`` table to reuse (e.g. one returned by a
        previous call). It is updated in place with any labels not yet
        present. When it is non-empty, its vector length is used as the
        encoding dimension so that old and new vectors stay compatible.

    Returns
    -------
    output : numpy.ndarray
        Array of shape ``(len(x), encoding_dimension)``.
    gaussian_encodings : dict
        The (possibly updated) ``label -> vector`` table.
    """
    if gaussian_encodings is None:
        gaussian_encodings = {}

    n = len(x)
    if gaussian_encodings:
        # Recomputing the dimension from this batch's label range could
        # disagree with the stored vectors and break the row assignment below.
        encoding_dimension = int(np.size(next(iter(gaussian_encodings.values()))))
    elif n == 0:
        # np.min / np.max raise on empty input; nothing to size the code from.
        encoding_dimension = 0
    else:
        num_categories = np.max(x) - np.min(x) + 1
        # int(log(k)) is 0 for k <= 2; keep at least one dimension so the
        # encoding can actually distinguish categories.
        encoding_dimension = max(1, int(np.log(num_categories)))

    output = np.zeros((n, encoding_dimension))
    for i, x_i in enumerate(x):
        if x_i not in gaussian_encodings:
            # Draw lazily, in order of first appearance of each label.
            gaussian_encodings[x_i] = np.random.normal(
                loc=0, scale=1, size=encoding_dimension
            )
        output[i, :] = gaussian_encodings[x_i]
    return output, gaussian_encodings
# synthetic test data: n integer labels drawn from [0, num_categories)
n, num_categories = 1000, 100
x = np.random.randint(0, num_categories, n)
# encode the same labels both ways
x_one_hot = one_hot_encoding(x)
x_gaussian, gaussian_encodings = gaussian_encoding(x)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment