# oliver-batey/pipeline.py

## pipeline.py
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Column names used by the stages below. They are declared once and reused so
# that the selector and the ColumnTransformer cannot drift apart.
text_features = "text"
numerical_features = ["n_words", "mean_word_length"]

# NOTE(review): CountWords, MeanWordLength and FeatureSelector are not imported
# or defined in the part of the file visible here -- confirm they are in scope.

# Stage 1: the initial pipeline, which generates the numerical columns.
# NOTE(review): CountWords is given "n_words" while MeanWordLength is given
# "text"; verify which column name each transformer actually expects.
pipeline_1 = Pipeline(
    [("n_words", CountWords("n_words")), ("mean_length", MeanWordLength("text"))]
)

# Stage 2a: processing applied to the numerical columns only: select them,
# then fill missing values with SimpleImputer's default settings.
num_pipeline = Pipeline(
    [("selector", FeatureSelector(numerical_features)), ("imp", SimpleImputer())]
)

# Stage 2b: process the text column and the numerical columns separately.
# The text column is passed as a scalar string (not a list) so that
# CountVectorizer receives a 1-D sequence of documents rather than a 2-D frame.
pipeline_2 = ColumnTransformer(
    [
        ("txt", CountVectorizer(), text_features),
        ("num", num_pipeline, numerical_features),
    ]
)

# Final pipeline: generate features, transform them, then classify with SVC().
pipeline = Pipeline(
    [("add_numerical", pipeline_1), ("transform", pipeline_2), ("clf", SVC())]
)
	from sklearn.compose import ColumnTransformer
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.impute import SimpleImputer
	from sklearn.pipeline import Pipeline
	from sklearn.svm import SVC

	# Define the names of the text and numerical features
	text_features = "text"
	numerical_features = ["n_words", "mean_word_length"]

	# Create the initial pipeline which generates numerical columns
	pipeline_1 = Pipeline(
	[("n_words", CountWords("n_words")), ("mean_length", MeanWordLength("text"))]
	)

	# Then use ColumnTransformer to process the numerical columns and the text column separately.
	# We define and apply num_pipeline to the numerical columns and CountVectorizer to the text column
	num_pipeline = Pipeline(
	[("selector", FeatureSelector(numerical_features)), ("imp", SimpleImputer())]
	)

	pipeline_2 = ColumnTransformer(
	[
	("txt", CountVectorizer(), "text"),
	("num", num_pipeline, ["n_words", "mean_word_length"]),
	]
	)

	# Build the final pipeline using pipeline_1 and pipeline_2 and an estimator, in this case SVC()
	pipeline = Pipeline(
	[("add_numerical", pipeline_1), ("transform", pipeline_2), ("clf", SVC())]
	)