Skip to content

Instantly share code, notes, and snippets.

@psinger
Last active August 3, 2018 08:17
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save psinger/ef4592492dc8edf101130f0bf32f5ff9 to your computer and use it in GitHub Desktop.
from sklearn.base import BaseEstimator, TransformerMixin
from pandas.api.types import CategoricalDtype
import pandas as pd
class DummyEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode DataFrame columns using categories learned in ``fit``.

    ``fit`` records, per column, the values that occur at least
    ``min_frequency`` times. ``transform`` casts each column to a categorical
    dtype restricted to those values and expands the frame with
    ``pandas.get_dummies``, so the set of output columns is determined by the
    fitted categories rather than by the values present at transform time.

    Parameters
    ----------
    min_frequency : int, default 1
        Minimum number of occurrences (non-missing) a value needs during
        ``fit`` to be kept as a category.
    dummy_na : bool, default True
        Forwarded to ``pandas.get_dummies``; when true an extra indicator
        column is emitted for missing values, which includes values that were
        not kept as categories.

    Attributes
    ----------
    categories : dict
        Maps each fitted column label to its list of kept values.
    features : sequence of column labels
        Column labels of the most recent ``transform`` output (empty list
        before the first ``transform`` call).
    """

    def __init__(self, min_frequency=1, dummy_na=True):
        self.min_frequency = min_frequency
        self.dummy_na = dummy_na
        self.categories = {}
        self.features = []

    def fit(self, X, y=None):
        """Learn the retained categories of every column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training frame; every column is treated as categorical.
        y : ignored
            Accepted only so the encoder can be used where a target is passed
            along (e.g. ``fit(X, y)`` / ``fit_transform(X, y)``).

        Returns
        -------
        self
        """
        # Build into a fresh dict so that refitting on a frame with fewer
        # columns does not leave entries from the previous fit behind.
        learned = {}
        for col in X.columns:
            counts = X[col].value_counts()
            # The value_counts index is already unique and free of missing
            # values; keeping its order (instead of round-tripping through a
            # set) makes the category order, and therefore the dummy column
            # order, reproducible between runs.
            learned[col] = counts[counts >= self.min_frequency].index.tolist()
        self.categories = learned
        return self

    def transform(self, X, *_):
        """Return the dummy-encoded version of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns were all present during ``fit``.
        *_ : ignored
            Extra positional arguments are accepted and discarded.

        Returns
        -------
        pandas.DataFrame
            Result of ``pandas.get_dummies`` on the categorical-cast frame.

        Raises
        ------
        KeyError
            If ``X`` contains a column that was not seen during ``fit``.
        """
        unseen = [col for col in X.columns if col not in self.categories]
        if unseen:
            raise KeyError(
                "columns not seen during fit: {!r}".format(unseen)
            )
        # One astype call with a per-column mapping instead of one full-frame
        # copy per column.
        dtypes = {
            col: CategoricalDtype(self.categories[col], ordered=True)
            for col in X.columns
        }
        ret = pd.get_dummies(X.astype(dtypes), dummy_na=self.dummy_na)
        # Remembered so the feature-name accessors can report the output
        # columns of the latest transform.
        self.features = ret.columns
        return ret

    def get_feature_names(self):
        """Return the column labels produced by the latest ``transform``."""
        return self.features

    def get_feature_names_out(self, input_features=None):
        """Return the latest output column labels as an object ndarray.

        ``input_features`` is accepted for signature compatibility and is not
        used.
        """
        return pd.Index(self.features).to_numpy(dtype=object)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment