Skip to content

Instantly share code, notes, and snippets.

@RahulDas-dev
Created July 31, 2023 20:38
Show Gist options
  • Save RahulDas-dev/1c1c2300a44ce157bc84e0062065af3e to your computer and use it in GitHub Desktop.
Save RahulDas-dev/1c1c2300a44ce157bc84e0062065af3e to your computer and use it in GitHub Desktop.
A Simple Scikit-learn Pipeline

Loading dataset

import pandas as pd
from sklearn.datasets import  fetch_openml

# Fetch OpenML dataset 1461 as a pandas-backed bunch and keep its full frame.
# The frame still holds the 'Class' column at this point.
x = fetch_openml(
    data_id=1461,
    as_frame=True,
    parser='pandas',
)
dataset = x.frame
print(f'dataset Shape {dataset.shape}')
dataset.head()

image

Train/Test Split

from sklearn.model_selection import train_test_split

# Detach the label column; after the pop, `dataset` holds only the features.
target = dataset.pop('Class')

# Hold out 33% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    target,
    test_size=0.33,
    random_state=42,
    stratify=target,
)

print(f'X_train shape {X_train.shape} y_train shape {y_train.shape}')
print(f'X_test shape  {X_test.shape}  y_test shape {y_test.shape}')
X_train shape (30291, 16) y_train shape (30291,)
X_test shape  (14920, 16)  y_test shape (14920,)

Building Custom Imputer and Transformer

import numpy as np
import warnings

from sklearn.preprocessing import FunctionTransformer
from sklearn.utils import check_array, check_X_y, estimator_checks
from sklearn.utils.validation import check_is_fitted

from sklearn.base import TransformerMixin, BaseEstimator


def from_bool_to_number(x: np.ndarray) -> np.ndarray:
    """Map truthy entries of ``x`` to ``1.0`` and falsy entries to ``0.0``."""
    when_true, when_false = 1.0, 0.0
    return np.where(x, when_true, when_false)


def from_number_to_bool(x: np.ndarray) -> np.ndarray:
    """Return a boolean array that is True where ``x`` is strictly above 0.5."""
    above_half = x > 0.5
    return np.where(above_half, True, False)


# Transformer instance pairing the bool -> number mapping with its inverse;
# the round-trip check is switched off.
BooleanTransformer = FunctionTransformer(
    func=from_bool_to_number,
    inverse_func=from_number_to_bool,
    check_inverse=False,
)


class ColumnsGuard(TransformerMixin, BaseEstimator):
    """
    Verify column names at predict time match the ones used when fitting.

    ``fit`` records the column labels of the training frame. ``transform``
    raises if any of those labels is absent and drops, with a warning, every
    column that was not seen during fitting.

    Only the column labels are inspected. Cell values are never validated or
    converted, so frames with categorical or string columns pass through
    unchanged. Column order is not checked.
    """

    def fit(self, X, y=None):
        """Record the column labels of ``X``.

        Parameters
        ----------
        X : object exposing ``.columns`` (e.g. a pandas DataFrame)
            Training data whose column labels become the expected schema.
        y : any, optional
            Ignored; accepted so the guard can sit inside a pipeline.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``X`` has no ``columns`` attribute.
        """
        self.expected_ = self._column_names(X)
        return self

    def transform(self, X):
        """Check ``X`` against the fitted schema and return the guarded frame.

        Parameters
        ----------
        X : object exposing ``.columns`` (e.g. a pandas DataFrame)

        Returns
        -------
        ``X`` itself when its columns match the fitted ones, otherwise
        ``X[self.expected_]`` with the unexpected columns removed.

        Raises
        ------
        ValueError
            If one or more fitted columns are missing from ``X``.
        TypeError
            If ``X`` has no ``columns`` attribute.
        """
        check_is_fitted(self, 'expected_')
        columns_got = self._column_names(X)
        missing = set(self.expected_) - set(columns_got)
        extra = set(columns_got) - set(self.expected_)
        if missing:
            # The f-prefix is required for the column names to be rendered.
            raise ValueError(f'Missing columns: {missing}')
        if extra:
            warnings.warn(f'Got extra columns: {extra}, ignoring', stacklevel=2)
            return X[self.expected_]
        return X

    @staticmethod
    def _column_names(X):
        """Return the column labels of ``X`` as a list, or raise TypeError."""
        if not hasattr(X, 'columns'):
            raise TypeError(
                'ColumnsGuard needs an input with named columns '
                f'(e.g. a pandas DataFrame), got {type(X).__name__}'
            )
        return list(X.columns)
            

class BooleanImputer(TransformerMixin, BaseEstimator):
    """Replace missing (NaN) entries with the most frequent value of each column.

    After fitting, ``fill_val_`` is a 1-D array holding one fill value per
    input column.
    """

    def fit(self, X, y=None):
        """Learn the most frequent non-missing value of every column.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            May contain NaN; it is converted to a 2-D array by ``check_array``.
        y : any, optional
            Ignored; accepted for pipeline compatibility.

        Returns
        -------
        self
        """
        # validate and convert if possible; NaN must be allowed through here.
        # NOTE(review): newer scikit-learn releases appear to rename this
        # keyword to `ensure_all_finite` - confirm against the installed version.
        X = check_array(X, force_all_finite=False)
        fill_values = []
        for column in X.T:
            observed = column[~np.isnan(column)]
            if observed.size == 0:
                # Nothing to learn from an all-missing column: fall back to 0.
                fill_values.append(0)
                continue
            values, counts = np.unique(observed, return_counts=True)
            # np.unique sorts its output, so ties resolve to the smaller value.
            fill_values.append(values[np.argmax(counts)])
        self.fill_val_ = np.asarray(fill_values, dtype=X.dtype)
        return self

    def transform(self, X):
        """Return ``X`` as an array with every NaN replaced by its column's fill value.

        Raises
        ------
        ValueError
            If ``X`` does not have the same number of columns seen in ``fit``.
        """
        check_is_fitted(self, 'fill_val_')
        X = check_array(X, force_all_finite=False)
        if X.shape[1] != self.fill_val_.shape[0]:
            raise ValueError(
                f'X has {X.shape[1]} columns, but BooleanImputer was fitted '
                f'with {self.fill_val_.shape[0]}'
            )
        # NaN never compares equal to itself, so np.isnan is needed to find it;
        # fill_val_ broadcasts across rows, one value per column.
        return np.where(np.isnan(X), self.fill_val_, X)

Building Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import make_column_selector as column_selector

from sklearn.neighbors import KNeighborsClassifier

# Numeric columns: fill gaps with the column mean.
# The ('scaler', StandardScaler()) step is left disabled, so values are not rescaled.
numeric_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Object / category columns: fill gaps with the mode, then one-hot encode,
# ignoring categories that were not seen during fitting.
categorical_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Boolean columns: impute, then map to numbers (the step is named 'onehot').
boolean_branch = Pipeline(steps=[
    ('imputer', BooleanImputer()),
    (
        'onehot',
        FunctionTransformer(
            from_bool_to_number,
            from_number_to_bool,
            check_inverse=False,
        ),
    ),
])

# Route each group of columns, selected by dtype, to its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numeric_branch, column_selector(dtype_include=[np.number])),
    ('categorical', categorical_branch, column_selector(dtype_include=[object, "category"])),
    ('boolean', boolean_branch, column_selector(dtype_include=bool)),
])

# Full model: column check -> preprocessing -> k-nearest-neighbours classifier.
pipeline = Pipeline(steps=[
    ("columnsguard", ColumnsGuard()),
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier()),
])

pipeline

image

Training Model

# Fit on the training split, then predict labels for the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

Saving model

import joblib
# Persist the fitted pipeline to disk. joblib files are pickle-based, so only
# load them back from sources you trust.
joblib.dump(value=pipeline, filename='pipeline.pkl')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment