Last active
January 17, 2018 07:39
-
-
Save StevenReitsma/78d1078706ad52d2bd9073e46d4786e1 to your computer and use it in GitHub Desktop.
Blogpost-Changepoint-Detection-Snippet4
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion, make_union
def get_categorical_glossary(x):
    """
    Builds a value -> integer code lookup for a single column.

    Missing values are excluded and therefore never receive a code.
    Codes start at 0 and follow the sorted order returned by ``np.unique``.

    Parameters
    ----------
    x : column object supporting ``notnull()`` and boolean-mask indexing
        (e.g. a pandas Series).

    Returns
    -------
    dict
        Maps each distinct non-missing value to its integer code.
    """
    # Keys and codes must come from the same set of values. Taking the keys
    # from the raw column (missing values included) made the two disagree in
    # length, and np.unique cannot sort strings mixed with None/NaN at all.
    categories = np.unique(x[x.notnull()])
    return dict(zip(categories, range(len(categories))))
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that keeps only the columns named at construction time.
    """

    def __init__(self, colnames):
        # Key (or list of keys) later used to index X in transform().
        self.colnames = colnames

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; returns the selector itself.
        """
        return self

    def transform(self, X, y=None):
        """
        Returns ``X`` indexed by the configured column names.
        """
        selected = X[self.colnames]
        return selected
class NumericEncoder(BaseEstimator, TransformerMixin):
    """
    Swaps categorical values for integer indices, column by column.
    Exists because the stock scikit-learn LabelEncoder cannot be used inside Pipelines.
    """

    def fit(self, X, y=None):
        """
        Learns one value -> code glossary per column of ``X``.
        """
        self.glossary = {
            column: get_categorical_glossary(X[column]) for column in X
        }
        return self

    def transform(self, X, y=None):
        """
        Returns a copy of ``X`` with every column mapped through its glossary.
        """
        encoded = X.copy()
        for column in X:
            lookup = self.glossary[column]
            # values absent from the glossary come out of map() as NaN
            encoded[column] = X[column].map(lookup)
        return encoded
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment