Skip to content

Instantly share code, notes, and snippets.

@AntonOsika
Last active April 8, 2018 14:04
Show Gist options
  • Save AntonOsika/cadee92254cde2e4b0bebbd0dd2012d3 to your computer and use it in GitHub Desktop.
Save AntonOsika/cadee92254cde2e4b0bebbd0dd2012d3 to your computer and use it in GitHub Desktop.
import bisect
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import column_or_1d
import numpy as np
class CategoricalTransform(LabelEncoder):
    """Encode labels as integers between 0 and n_classes-1.

    Every value is cast to a string before encoding. Labels that were not
    kept as a class during ``fit`` (either never seen, or seen fewer than
    ``min_category_size`` times) are encoded as the reserved
    ``'<unknown>'`` class.

    # Arguments:
        min_category_size: integer, number of samples necessary to form a
            separate class when fitting. ``None`` (or 0) keeps every
            distinct value as its own class.

    # Attributes:
        classes_: sorted 1-D array of string labels, always containing
            ``'<unknown>'``. The position of a label in this array is its
            integer code.

    Read more in sklearn :ref:`User Guide <preprocessing_targets>`.
    """

    # Reserved label used for values that have no class of their own.
    _UNKNOWN = '<unknown>'

    def __init__(self, min_category_size=None):
        super(CategoricalTransform, self).__init__()
        self.min_category_size = min_category_size

    def fit(self, y):
        """Fit label encoder

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values. They are converted to strings before the
            classes are collected.

        Returns
        -------
        self : returns an instance of self.
        """
        y = np.asarray(y).astype('str')
        classes, counts = np.unique(y, return_counts=True)
        if self.min_category_size:
            # Drop rare classes; their samples will encode as '<unknown>'.
            classes = classes[counts >= self.min_category_size]
        # union1d keeps the array sorted and de-duplicated, so the reserved
        # label appears exactly once even when the data already contains the
        # literal string '<unknown>'. Storing an ndarray (not a list) also
        # keeps the inherited inverse_transform usable, since it indexes
        # classes_ with an integer array.
        self.classes_ = np.union1d(classes, [self._UNKNOWN])
        return self

    def transform(self, y):
        """Transform labels to normalized encoding.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values. They are converted to strings before lookup.

        Returns
        -------
        y : array-like of shape [n_samples]
            Integer codes; labels absent from ``classes_`` receive the code
            of ``'<unknown>'``.
        """
        check_is_fitted(self, 'classes_')
        y = column_or_1d(y, warn=True)
        y = y.astype('str')

        classes = np.asarray(self.classes_)
        unknown_code = np.searchsorted(classes, self._UNKNOWN)
        codes = np.searchsorted(classes, y)

        # A label is known only if the class found at its insertion point is
        # an exact match. Clip first so labels sorting after the last class
        # do not index out of bounds.
        nearest = classes[np.minimum(codes, len(classes) - 1)]
        is_known = nearest == y

        # Substitute the code instead of writing '<unknown>' into ``y``:
        # ``y`` is a fixed-width string array, so a longer string assigned
        # into it would be silently truncated and could match a wrong class.
        return np.where(is_known, codes, unknown_code)

    def fit_transform(self, y):
        """Fit the encoder on ``y`` and return the encoded ``y``."""
        self.fit(y)
        return self.transform(y)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment