Skip to content

Instantly share code, notes, and snippets.

@fernandojunior
Created September 8, 2022 05:58
Show Gist options
  • Save fernandojunior/4cd5055fe91a69d80b35108e1b04af53 to your computer and use it in GitHub Desktop.
Save fernandojunior/4cd5055fe91a69d80b35108e1b04af53 to your computer and use it in GitHub Desktop.
Use OOP and functional programming to create data pipelines with sklearn, classes and pure functions
from typing import Any, Callable
from dataclasses import dataclass
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
class DataframeTransformer:
    """Pipeline step that delegates its transformation to a plain callable.

    The step learns nothing from data: ``fit`` returns the instance
    unchanged and ``transform`` calls the wrapped function on its input.
    """

    def __init__(self, func: Callable):
        # Callable invoked with the data on every ``transform`` call.
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """Return ``self`` without inspecting ``X``, ``y`` or ``fit_params``."""
        return self

    def transform(self, input_df: pd.DataFrame, **transform_params) -> pd.DataFrame:
        """Apply the wrapped callable to ``input_df`` and return its result.

        ``transform_params`` are accepted but not forwarded to the callable.
        """
        transformed = self.func(input_df)
        return transformed
def custom_scaling(df: pd.DataFrame) -> Any:
    """Min-max scale each column of ``df`` independently.

    The reductions use ``axis=0`` explicitly so that a 2-D NumPy array is
    scaled column by column, exactly like a DataFrame. Without the axis
    argument ``ndarray.min()``/``max()`` reduce over the whole array, which
    would scale every column by the global minimum and maximum.

    Args:
        df: Tabular numeric data; a pandas DataFrame or a NumPy array.

    Returns:
        ``(df - min) / (max - min)`` computed per column, in the same
        container type as the input. A column whose minimum equals its
        maximum is mapped to 0 instead of producing 0/0.
    """
    col_min = df.min(axis=0)
    col_range = df.max(axis=0) - col_min
    # Divide constant columns by 1 so they become 0 rather than NaN.
    safe_range = np.where(col_range == 0, 1, col_range)
    return (df - col_min) / safe_range
@dataclass
class Analyze:
    """Read a CSV file, run it through a preprocessing pipeline, write a CSV."""

    # Path handed to ``pd.read_csv``.
    input_path: str
    # Path handed to ``DataFrame.to_csv`` for the transformed table.
    output_path: str

    def create_pipe(self):
        """Build the preprocessing pipeline.

        Returns:
            A ``Pipeline`` with three steps: median imputation,
            ``custom_scaling`` wrapped in a ``DataframeTransformer``, and
            ``np.log1p`` wrapped in a ``FunctionTransformer``.
        """
        return Pipeline([
            ("step1", SimpleImputer(strategy="median")),
            ("step2", DataframeTransformer(custom_scaling)),
            # Alternative for step2 kept from the original:
            # ("step2", MinMaxScaler()),
            ("step3", FunctionTransformer(np.log1p)),
        ])

    def run(self):
        """Load the input CSV, transform it, and write it to ``output_path``.

        The pipeline output is wrapped in a new DataFrame that reuses the
        input's index and column labels. Assigning the result back with
        ``df.loc[:, :] = ...`` would try to write float values into the
        dtypes inferred by ``read_csv`` (for example integer columns).
        """
        df = pd.read_csv(self.input_path)
        transformed = self.create_pipe().fit_transform(df)
        result = pd.DataFrame(transformed, index=df.index, columns=df.columns)
        # NOTE: ``to_csv`` is called with its defaults, so the index is
        # written as the first column, as in the original code.
        result.to_csv(self.output_path)
# Script entry: configure the job with its input/output CSV locations, then run it.
analyze = Analyze(input_path="path/to/input/data", output_path="path/to/output/data")
analyze.run()