Skip to content

Instantly share code, notes, and snippets.

@yankov
Created February 3, 2015 19:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save yankov/a86d44bf0e6009944c21 to your computer and use it in GitHub Desktop.
Save yankov/a86d44bf0e6009944c21 to your computer and use it in GitHub Desktop.
Partial one-hot encoder
from scipy import sparse
class OneHotEncoder(object):
    """
    Convert a matrix of categorical columns into a sparse binary matrix
    using one-of-k (one-hot) encoding.

    For every input column the encoder keeps a dict mapping each category
    value to an output column index (``self.keymap[i]`` for input column
    ``i``). Indices are assigned in first-seen order and are never
    reassigned, so the encoding of already-known categories stays stable
    when more data is fed through ``partial_fit``.

    Inputs are expected to be 2-D array-likes exposing ``.T`` and
    ``.shape`` (e.g. numpy arrays) whose elements are hashable.

    Parts of code borrowed from Paul Duan (www.paulduan.com)
    Licence: MIT (https://github.com/pyduan/amazonaccess/blob/master/MIT-LICENSE)
    """

    def __init__(self):
        # One {category: column index} dict per input column;
        # None until fit / partial_fit / transform has been called.
        self.keymap = None

    @staticmethod
    def _extend_mapping(mapping, values):
        """Add unseen ``values`` to ``mapping`` with the next free indices."""
        for key in values:
            if key not in mapping:
                # len(mapping) is always the next unused index because
                # indices are handed out densely starting at 0.
                mapping[key] = len(mapping)
        return mapping

    def fit(self, x):
        """
        Learn the categories of every column of ``x`` from scratch.

        Any previously learned mapping is discarded.
        """
        self.keymap = [self._extend_mapping({}, col) for col in x.T]

    def partial_fit(self, x):
        """
        Update the learned categories with the mini-batch ``x``.

        This method can be used for doing one hot encoding in mini-batch
        mode. Categories seen in earlier batches keep their column index;
        categories that are new in ``x`` are appended after them.

        Raises IndexError if ``x`` has more columns than were fitted.
        """
        if self.keymap is None:
            self.fit(x)
            return
        for i, col in enumerate(x.T):
            self._extend_mapping(self.keymap[i], col)

    def transform(self, x):
        """
        Encode ``x`` as a CSR sparse matrix of zeros and ones.

        Each input column ``i`` contributes ``len(self.keymap[i])`` output
        columns; the per-column blocks are stacked left to right. Values
        that were never seen during fitting produce an all-zero block for
        that row. If the encoder has not been fitted yet, it is fitted on
        ``x`` first.
        """
        if self.keymap is None:
            self.fit(x)
        n_rows = x.shape[0]
        blocks = []
        for i, col in enumerate(x.T):
            km = self.keymap[i]
            # Collect coordinates first and build the block in one go;
            # this avoids per-element assignment into a sparse matrix.
            rows = []
            cols = []
            for j, val in enumerate(col):
                idx = km.get(val)
                if idx is not None:
                    rows.append(j)
                    cols.append(idx)
            data = [1.0] * len(rows)
            blocks.append(
                sparse.coo_matrix(
                    (data, (rows, cols)),
                    shape=(n_rows, len(km)),
                    dtype=float,
                )
            )
        return sparse.hstack(blocks).tocsr()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment