# jantrienes/eli5_pipeline.py

## eli5_pipeline.py
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

import eli5

# Fetch the 20 newsgroups documents together with their integer labels.
X, y = fetch_20newsgroups(return_X_y=True)
X = pd.DataFrame(X, columns=['text'])
y = pd.Series(y)
# Character length of each document, used as an additional numeric feature.
X['text_len'] = X['text'].str.len()

# Hold out 33% of the rows for testing; stratify on the labels and fix the
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)

# Bag-of-words counts from 'text' followed by the min-max scaled 'text_len'
# column are passed to a logistic regression classifier. Any other column
# would be dropped by the ColumnTransformer.
pipe = Pipeline([
    ('features', ColumnTransformer([
        ('bow', CountVectorizer(), 'text'),
        ('text_len', MinMaxScaler(), ['text_len']),
    ], remainder='drop')),
    ('clf', LogisticRegression()),
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# Look up the *fitted* vectorizer by name rather than by position, so the
# lookup keeps working if steps or transformers are reordered.
vec = pipe.named_steps['features'].named_transformers_['bow']

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back for older releases.
try:
    bow_feature_names = vec.get_feature_names_out()
except AttributeError:
    bow_feature_names = vec.get_feature_names()

# The feature order must match the ColumnTransformer output: all bag-of-words
# columns first, then the scaled text length, whose name is appended manually.
# list() is required because get_feature_names_out() returns an ndarray, and
# `ndarray + list` would broadcast instead of appending.
feature_names = list(bow_feature_names) + ['text_len']

# Show the top weights of the fitted classifier (the 'clf' pipeline step).
# Kept as the final bare expression so a notebook cell renders the result.
eli5.show_weights(pipe.named_steps['clf'], feature_names=feature_names, top=10)
	import pandas as pd
	from sklearn.compose import ColumnTransformer
	from sklearn.datasets import fetch_20newsgroups
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import train_test_split
	from sklearn.pipeline import Pipeline
	from sklearn.preprocessing import MinMaxScaler

	import eli5

	# Fetch the 20 newsgroups documents together with their integer labels.
	X, y = fetch_20newsgroups(return_X_y=True)
	X = pd.DataFrame(X, columns=['text'])
	y = pd.Series(y)
	# Character length of each document, used as an additional numeric feature.
	X['text_len'] = X['text'].str.len()

	# Hold out 33% of the rows for testing; stratify on the labels and fix the
	# seed so the split is reproducible.
	X_train, X_test, y_train, y_test = train_test_split(
		X, y, test_size=0.33, random_state=42, stratify=y)

	# Bag-of-words counts from 'text' followed by the min-max scaled 'text_len'
	# column are passed to a logistic regression classifier. Any other column
	# would be dropped by the ColumnTransformer.
	pipe = Pipeline([
		('features', ColumnTransformer([
			('bow', CountVectorizer(), 'text'),
			('text_len', MinMaxScaler(), ['text_len']),
		], remainder='drop')),
		('clf', LogisticRegression()),
	])

	pipe.fit(X_train, y_train)
	y_pred = pipe.predict(X_test)

	# Look up the *fitted* vectorizer by name rather than by position, so the
	# lookup keeps working if steps or transformers are reordered.
	vec = pipe.named_steps['features'].named_transformers_['bow']

	# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
	# prefer get_feature_names_out() and fall back for older releases. The `or`
	# short-circuits, so the legacy attribute is only touched when needed.
	bow_names_getter = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names

	# The feature order must match the ColumnTransformer output: all bag-of-words
	# columns first, then the scaled text length, whose name is appended manually.
	# list() is required because get_feature_names_out() returns an ndarray, and
	# `ndarray + list` would broadcast instead of appending.
	feature_names = list(bow_names_getter()) + ['text_len']

	# Show the top weights of the fitted classifier (the 'clf' pipeline step).
	# Kept as the final bare expression so a notebook cell renders the result.
	eli5.show_weights(pipe.named_steps['clf'], feature_names=feature_names, top=10)