Skip to content

Instantly share code, notes, and snippets.

@StevenReitsma
Last active January 17, 2018 07:39
Show Gist options
  • Save StevenReitsma/78d1078706ad52d2bd9073e46d4786e1 to your computer and use it in GitHub Desktop.
Save StevenReitsma/78d1078706ad52d2bd9073e46d4786e1 to your computer and use it in GitHub Desktop.
Blogpost-Changepoint-Detection-Snippet4
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion, make_union
def get_categorical_glossary(x):
    """Map each distinct non-null value of a column to an integer code.

    Codes run from 0 upward and follow the sorted order in which
    ``np.unique`` returns the distinct values.

    Args:
        x: Column to encode. It must support ``x.notnull()`` and boolean-mask
            indexing (presumably a pandas Series -- confirm with callers).

    Returns:
        dict mapping each distinct non-null value to its integer code.
        Null values get no entry.
    """
    # Drop nulls *before* calling np.unique: sorting an object column that
    # mixes strings with None/NaN raises TypeError, and for numeric columns
    # we no longer depend on NaN happening to sort last.
    distinct_values = np.unique(x[x.notnull()])
    return {value: code for code, value in enumerate(distinct_values)}
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that keeps only the columns named in ``colnames``.
    """

    def __init__(self, colnames):
        # Stored under the same name as the constructor argument.
        self.colnames = colnames

    def fit(self, X, y=None):
        """Nothing is learned from the data; the instance is returned as-is."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured column name(s)."""
        wanted = self.colnames
        return X[wanted]
class NumericEncoder(BaseEstimator, TransformerMixin):
    """
    Swaps the values of every column for integer indices.

    Author's stated rationale: the default scikit-learn LabelEncoder is not
    supported in Pipelines.
    """

    def fit(self, X, y=None):
        """Build one value-to-index glossary per column of ``X``; return self."""
        self.glossary = {
            column: get_categorical_glossary(X[column]) for column in X
        }
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with each column mapped through its glossary."""
        encoded = X.copy()
        for column in X:
            lookup = self.glossary[column]
            # A value with no entry in the glossary becomes NaN.
            encoded[column] = X[column].map(lookup)
        return encoded
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment