Skip to content

Instantly share code, notes, and snippets.

@oliver-batey
Last active March 22, 2022 16:40
Show Gist options
  • Save oliver-batey/1d804ef3c838a285ba4e37d30b9bc284 to your computer and use it in GitHub Desktop.
Save oliver-batey/1d804ef3c838a285ba4e37d30b9bc284 to your computer and use it in GitHub Desktop.
Pipeline example
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
# Column names: the raw text column, and the numerical columns that the
# first pipeline stage generates.
text_features = "text"
numerical_features = ["n_words", "mean_word_length"]

# Stage 1: chain the two custom transformers that generate the numerical columns.
# NOTE(review): CountWords and MeanWordLength are neither imported nor defined
# in this snippet; they must already be in scope -- confirm where they live.
pipeline_1 = Pipeline(
    steps=[
        ("n_words", CountWords("n_words")),
        ("mean_length", MeanWordLength("text")),
    ]
)
# Stage 2: use a ColumnTransformer to process the numerical columns and the
# text column separately and combine the results.
# The column selections reuse text_features / numerical_features so that each
# column name is spelled out in exactly one place.

# Numerical branch: FeatureSelector over the numerical columns, followed by a
# SimpleImputer constructed with its default arguments.
# NOTE(review): FeatureSelector is neither imported nor defined in this
# snippet; it must already be in scope -- confirm where it lives.
num_pipeline = Pipeline(
    [
        ("selector", FeatureSelector(numerical_features)),
        ("imp", SimpleImputer()),
    ]
)
pipeline_2 = ColumnTransformer(
    [
        # text_features is a single column name (a string, not a list), so
        # ColumnTransformer hands CountVectorizer a 1-D column of documents.
        ("txt", CountVectorizer(), text_features),
        ("num", num_pipeline, numerical_features),
    ]
)
# Final model: pipeline_1 runs first, then pipeline_2, and an SVC() estimator
# is the last step.
pipeline = Pipeline(
    steps=[
        ("add_numerical", pipeline_1),
        ("transform", pipeline_2),
        ("clf", SVC()),
    ]
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment