Skip to content

Instantly share code, notes, and snippets.

@adamnovotnycom
Last active June 20, 2020 20:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adamnovotnycom/dda3bad52112f64d1ef2d136eeb66b4e to your computer and use it in GitHub Desktop.
Save adamnovotnycom/dda3bad52112f64d1ef2d136eeb66b4e to your computer and use it in GitHub Desktop.
sklearn pipeline
# train_dataset: pd.DataFrame
# Columns
# numerical_features = ["close_percent_bb", "macd_diff", "force_index"]
# type: float
# categorical_features = ["previous_day_flag"]
# type: str
# value from set {"up", "down", "flat"}
# Example:
# >>train_dataset.loc[:, numerical_features + categorical_features + ["label"]].head(5)
# close_percent_bb macd_diff force_index previous_day_flag label
# 0.966666 0.065271 1.907922e+07 down 1.0
# 0.955114 0.098365 1.985671e+07 up 1.0
# 0.911741 0.111657 1.868552e+07 flat 0.0
# 0.951083 0.132366 2.178419e+07 flat 1.0
# 0.963766 0.151140 2.343282e+07 flat 0.0
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Select a fixed subset of DataFrame columns.

    Stateless sklearn transformer: ``fit`` learns nothing and ``transform``
    simply returns ``X[self.feature_names]`` (column order preserved).
    """

    def __init__(self, feature_names):
        # Column labels to keep, e.g. ["close_percent_bb", "macd_diff"].
        self.feature_names = feature_names

    def fit(self, X, y=None):
        # No state to learn; present only to satisfy the sklearn API.
        return self

    def transform(self, X, y=None):
        # Project the DataFrame down to the configured columns.
        return X[self.feature_names]
# Numeric branch: select the numeric columns, fill missing values with the
# column median, then standardize to zero mean / unit variance.
numerical_pipeline = Pipeline(
    steps=[
        ("num_selector", FeatureSelector(numerical_features)),
        ("impute_median", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)
# Categorical branch: select the categorical columns and one-hot encode them.
#
# NOTE(review): only "up" and "down" are declared as categories even though the
# data also contains "flat".  With handle_unknown="ignore" a "flat" value is
# encoded as an all-zeros row instead of raising — presumably intentional
# (the example output below shows flat rows as 0.0 / 0.0); confirm.
categorical_pipeline = Pipeline(
    steps=[
        # Fixed copy-paste step name: this selects categorical columns, so
        # "cat_selector", not "num_selector" (that name belongs to the
        # numeric pipeline's selector step).
        ("cat_selector", FeatureSelector(categorical_features)),
        (
            "ohe",
            OneHotEncoder(
                handle_unknown="ignore",  # undeclared categories -> all zeros
                sparse=False,  # dense array (parameter renamed to sparse_output in sklearn >= 1.2)
                categories=[
                    ["up", "down"],
                    # one list per categorical input column; append another
                    # list here if categorical_features gains a column
                ],
            ),
        ),
    ]
)
# Concatenate the two branches column-wise: numeric features first, then the
# one-hot encoded categorical features.
feature_pipeline = FeatureUnion(
    n_jobs=1,
    transformer_list=[
        ("numerical_pipeline", numerical_pipeline),
        ("categorical_pipeline", categorical_pipeline),
    ],
)
# Fit the feature pipeline on the training data.  FeatureUnion.fit returns
# self, so full_pipeline is just another name for the fitted feature_pipeline.
full_pipeline = feature_pipeline.fit(
    train_dataset.loc[:, numerical_features + categorical_features],
    train_dataset["label"],
)

# Transform both splits with statistics fitted on train only, so the test set
# never leaks into the imputer / scaler / encoder.
train_dataset_transformed = feature_pipeline.transform(
    train_dataset.loc[:, numerical_features + categorical_features]
)
test_dataset_transformed = feature_pipeline.transform(
    test_dataset.loc[:, numerical_features + categorical_features]
)

# Transforming must preserve the row count of each split.
assert len(train_dataset_transformed) == len(train_dataset)
assert len(test_dataset_transformed) == len(test_dataset)

# Recover the one-hot column names from the fitted encoder:
#   transformer_list[1][1] -> the categorical Pipeline,
#   .steps[1][1]           -> the fitted OneHotEncoder,
#   .categories_           -> one array of category labels per input column.
# Flatten those arrays into a single flat list of output column names.
transformed_categories = [
    category
    for per_column in full_pipeline.transformer_list[1][1].steps[1][1].categories_
    for category in per_column
]
transformed_categories
>> pd.DataFrame(
>> train_dataset_transformed,
>> columns = numerical_features + transformed_categories
>> )
# close_percent_bb macd_diff force_index up down
# 1.108006 0.164807 0.418935 0.0 1.0
# 1.072238 0.243592 0.430490 1.0 0.0
# 0.937938 0.275237 0.413084 0.0 0.0
# 1.059756 0.324540 0.459135 0.0 0.0
# 1.099027 0.369234 0.483637 0.0 0.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment