Last active
June 20, 2020 20:39
-
-
Save adamnovotnycom/dda3bad52112f64d1ef2d136eeb66b4e to your computer and use it in GitHub Desktop.
sklearn pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# train_dataset: pd.DataFrame
# Columns
#   numerical_features = ["close_percent_bb", "macd_diff", "force_index"]
#     type: float
#   categorical_features = ["previous_day_flag"]
#     type: str
#     value from set {"up", "down", "flat"}
# Example:
# >> train_dataset.loc[:, numerical_features + categorical_features + ["label"]].head(5)
#    close_percent_bb  macd_diff   force_index  previous_day_flag  label
#            0.966666   0.065271  1.907922e+07               down    1.0
#            0.955114   0.098365  1.985671e+07                 up    1.0
#            0.911741   0.111657  1.868552e+07               flat    0.0
#            0.951083   0.132366  2.178419e+07               flat    1.0
#            0.963766   0.151140  2.343282e+07               flat    0.0
import pandas as pd | |
from sklearn.base import BaseEstimator, TransformerMixin | |
from sklearn.impute import SimpleImputer | |
from sklearn.pipeline import FeatureUnion, Pipeline | |
from sklearn.preprocessing import StandardScaler, OneHotEncoder | |
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the configured entries of ``X``.

    Parameters
    ----------
    feature_names
        Key(s) used to index ``X`` in :meth:`transform`; stored unchanged on
        the instance.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Learn nothing and return this instance unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by ``self.feature_names``; ``y`` is ignored."""
        selected = X[self.feature_names]
        return selected
# Numerical branch: select the numerical columns, fill missing values using
# the "median" imputation strategy, then pass the result to StandardScaler.
numerical_pipeline = Pipeline(
    [
        ("num_selector", FeatureSelector(numerical_features)),
        ("impute_median", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)
def _dense_one_hot_encoder(categories):
    """Build a dense-output ``OneHotEncoder`` that ignores unknown categories.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    later releases no longer accept the old name, so the new spelling is tried
    first and the old one is used only when the installed version rejects it.

    Parameters
    ----------
    categories
        One list of expected category values per categorical feature, passed
        straight through to ``OneHotEncoder``.

    Returns
    -------
    OneHotEncoder
        An unfitted encoder configured with ``handle_unknown="ignore"`` and
        dense (non-sparse) output.
    """
    try:
        return OneHotEncoder(
            handle_unknown="ignore",
            sparse_output=False,
            categories=categories,
        )
    except TypeError:
        # Older scikit-learn: the keyword is still called ``sparse``.
        return OneHotEncoder(
            handle_unknown="ignore",
            sparse=False,
            categories=categories,
        )


# Categorical branch: select the categorical columns and one-hot encode them.
categorical_pipeline = Pipeline(steps=[
    # NOTE(review): the step name "num_selector" looks copy-pasted from the
    # numerical branch; it is kept as-is because step names are part of the
    # pipeline's parameter interface.
    ("num_selector", FeatureSelector(categorical_features)),
    ("ohe", _dense_one_hot_encoder(
        categories=[
            # Only "up" and "down" get their own column. Any other value
            # (e.g. "flat") is unknown to the encoder and, because unknowns
            # are ignored, is encoded as all zeros.
            ["up", "down"],
            # second cat...
        ],
    )),
])
# Combine both branches with a FeatureUnion; the numerical branch is listed
# first and the categorical branch second.
feature_pipeline = FeatureUnion(
    transformer_list=[
        ("numerical_pipeline", numerical_pipeline),
        ("categorical_pipeline", categorical_pipeline),
    ],
    n_jobs=1,
)
# Fit the combined feature pipeline on the training features and labels.
# The object returned by fit() is kept as full_pipeline; the fitted encoder
# categories are read from it further down.
full_pipeline = feature_pipeline.fit(
    X=train_dataset.loc[:, numerical_features + categorical_features],
    y=train_dataset["label"],
)
# Apply the fitted feature pipeline to the train and test feature columns.
train_dataset_transformed = feature_pipeline.transform(
    train_dataset.loc[:, numerical_features + categorical_features]
)
test_dataset_transformed = feature_pipeline.transform(
    test_dataset.loc[:, numerical_features + categorical_features]
)

# Sanity check: the transformation must not add or drop rows.
assert len(train_dataset) == len(train_dataset_transformed)
assert len(test_dataset) == len(test_dataset_transformed)
# Look up the fitted one-hot encoder by name ("categorical_pipeline" -> "ohe")
# rather than by position inside the union and the pipeline.
_fitted_ohe = dict(full_pipeline.transformer_list)["categorical_pipeline"].named_steps["ohe"]

# categories_ holds one sequence of category values per encoded feature;
# flatten them into a single list in encoder column order.
transformed_categories = [
    category
    for feature_categories in _fitted_ohe.categories_
    for category in feature_categories
]
transformed_categories
# >> pd.DataFrame(
# >>     train_dataset_transformed,
# >>     columns = numerical_features + transformed_categories
# >> )
#    close_percent_bb  macd_diff  force_index   up  down
#            1.108006   0.164807     0.418935  0.0   1.0
#            1.072238   0.243592     0.430490  1.0   0.0
#            0.937938   0.275237     0.413084  0.0   0.0
#            1.059756   0.324540     0.459135  0.0   0.0
#            1.099027   0.369234     0.483637  0.0   0.0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment