Skip to content

Instantly share code, notes, and snippets.

@jmquintana79
Last active July 8, 2024 11:55
Show Gist options
  • Save jmquintana79/0b194c83b7e693a07dbee302cc77c749 to your computer and use it in GitHub Desktop.
Save jmquintana79/0b194c83b7e693a07dbee302cc77c749 to your computer and use it in GitHub Desktop.
References: How to Create Pipelines in Scikit-learn for More Efficient Data Processing (https://www.statology.org/how-create-pipelines-scikit-learn-for-more-efficient-data-processing/)
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# example models and preprocessors
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
# NOTE: X (features) and y (target) are not defined in this snippet; they must
# exist before the pipeline is fitted at the bottom of the script. Columns are
# selected by name, so X is presumably a pandas DataFrame -- confirm with the
# caller.

# Numeric columns: fill missing values with the column median, then pass the
# result through StandardScaler.
numerical_features = ["age", "income"]
numerical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps the encoder from raising
# on categories that were not present when it was fitted.
categorical_features = ["gender", "occupation"]
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
# Route each group of columns to its own sub-pipeline. Each entry is
# (name, transformer, columns). No `remainder` is passed, so the library
# default applies to any column not listed here.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)
# End-to-end estimator: column preprocessing followed by the classifier.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

# A single fit call trains the preprocessing steps and the model together.
# X and y are not defined in this snippet and must be supplied beforehand.
pipeline.fit(X, y)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment